date:20230814

LGTM



juzhe.zh...@rivai.ai
 
From: Li, Pan2
Date: 2023-08-14 15:01
To: juzhe.zh...@rivai.ai
Subject: FW: [PATCH v1] RISC-V: Support RVV VFWNMACC rounding mode intrinsic API
Kindly ping.
 
Pan
 
-Original Message-
From: Li, Pan2  
Sent: Monday, August 14, 2023 10:36 AM
To: gcc-patches@gcc.gnu.org
Cc: juzhe.zh...@rivai.ai; Li, Pan2 ; Wang, Yanzhang 
; kito.ch...@gmail.com
Subject: [PATCH v1] RISC-V: Support RVV VFWNMACC rounding mode intrinsic API
 
From: Pan Li 
 
This patch would like to support the rounding mode API for the
VFWNMACC as the below samples.
 
* __riscv_vfwnmacc_vv_f64m2_rm
* __riscv_vfwnmacc_vv_f64m2_rm_m
* __riscv_vfwnmacc_vf_f64m2_rm
* __riscv_vfwnmacc_vf_f64m2_rm_m
 
Signed-off-by: Pan Li 
 
gcc/ChangeLog:
 
* config/riscv/riscv-vector-builtins-bases.cc
(class vfwnmacc_frm): New class for frm.
(vfwnmacc_frm_obj): New declaration.
(BASE): Ditto.
* config/riscv/riscv-vector-builtins-bases.h: Ditto.
* config/riscv/riscv-vector-builtins-functions.def
(vfwnmacc_frm): New intrinsic function definition.
 
gcc/testsuite/ChangeLog:
 
* gcc.target/riscv/rvv/base/float-point-wnmacc.c: New test.
---
.../riscv/riscv-vector-builtins-bases.cc  | 25 ++
.../riscv/riscv-vector-builtins-bases.h   |  1 +
.../riscv/riscv-vector-builtins-functions.def |  2 +
.../riscv/rvv/base/float-point-wnmacc.c   | 47 +++
4 files changed, 75 insertions(+)
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/base/float-point-wnmacc.c
 
diff --git a/gcc/config/riscv/riscv-vector-builtins-bases.cc 
b/gcc/config/riscv/riscv-vector-builtins-bases.cc
index e84d6d1d047..4a7f2b8e3e9 100644
--- a/gcc/config/riscv/riscv-vector-builtins-bases.cc
+++ b/gcc/config/riscv/riscv-vector-builtins-bases.cc
@@ -562,6 +562,29 @@ public:
   }
};
+/* Implements below instructions for frm
+   - vfwnmacc
+*/
+class vfwnmacc_frm : public function_base
+{
+public:
+  bool has_rounding_mode_operand_p () const override { return true; }
+
+  bool has_merge_operand_p () const override { return false; }
+
+  rtx expand (function_expander &e) const override
+  {
+if (e.op_info->op == OP_TYPE_vf)
+  return e.use_widen_ternop_insn (
+ code_for_pred_widen_mul_neg_scalar (MINUS, e.vector_mode ()));
+if (e.op_info->op == OP_TYPE_vv)
+  return e.use_widen_ternop_insn (
+ code_for_pred_widen_mul_neg (MINUS, e.vector_mode ()));
+
+gcc_unreachable ();
+  }
+};
+
/* Implements vrsub.  */
class vrsub : public function_base
{
@@ -2340,6 +2363,7 @@ static CONSTEXPR const vfmsub_frm vfmsub_frm_obj;
static CONSTEXPR const vfwmacc vfwmacc_obj;
static CONSTEXPR const vfwmacc_frm vfwmacc_frm_obj;
static CONSTEXPR const vfwnmacc vfwnmacc_obj;
+static CONSTEXPR const vfwnmacc_frm vfwnmacc_frm_obj;
static CONSTEXPR const vfwmsac vfwmsac_obj;
static CONSTEXPR const vfwnmsac vfwnmsac_obj;
static CONSTEXPR const unop vfsqrt_obj;
@@ -2584,6 +2608,7 @@ BASE (vfmsub_frm)
BASE (vfwmacc)
BASE (vfwmacc_frm)
BASE (vfwnmacc)
+BASE (vfwnmacc_frm)
BASE (vfwmsac)
BASE (vfwnmsac)
BASE (vfsqrt)
diff --git a/gcc/config/riscv/riscv-vector-builtins-bases.h 
b/gcc/config/riscv/riscv-vector-builtins-bases.h
index acbc7d42fbe..27c7deb4ec2 100644
--- a/gcc/config/riscv/riscv-vector-builtins-bases.h
+++ b/gcc/config/riscv/riscv-vector-builtins-bases.h
@@ -178,6 +178,7 @@ extern const function_base *const vfmsub_frm;
extern const function_base *const vfwmacc;
extern const function_base *const vfwmacc_frm;
extern const function_base *const vfwnmacc;
+extern const function_base *const vfwnmacc_frm;
extern const function_base *const vfwmsac;
extern const function_base *const vfwnmsac;
extern const function_base *const vfsqrt;
diff --git a/gcc/config/riscv/riscv-vector-builtins-functions.def 
b/gcc/config/riscv/riscv-vector-builtins-functions.def
index 0b73a5bcbc5..481c3b899f2 100644
--- a/gcc/config/riscv/riscv-vector-builtins-functions.def
+++ b/gcc/config/riscv/riscv-vector-builtins-functions.def
@@ -378,6 +378,8 @@ DEF_RVV_FUNCTION (vfwnmsac, alu, full_preds, f_wwfv_ops)
DEF_RVV_FUNCTION (vfwmacc_frm, alu_frm, full_preds, f_wwvv_ops)
DEF_RVV_FUNCTION (vfwmacc_frm, alu_frm, full_preds, f_wwfv_ops)
+DEF_RVV_FUNCTION (vfwnmacc_frm, alu_frm, full_preds, f_wwvv_ops)
+DEF_RVV_FUNCTION (vfwnmacc_frm, alu_frm, full_preds, f_wwfv_ops)
// 13.8. Vector Floating-Point Square-Root Instruction
DEF_RVV_FUNCTION (vfsqrt, alu, full_preds, f_v_ops)
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/base/float-point-wnmacc.c 
b/gcc/testsuite/gcc.target/riscv/rvv/base/float-point-wnmacc.c
new file mode 100644
index 000..2602289ec88
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/base/float-point-wnmacc.c
@@ -0,0 +1,47 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv -mabi=lp64 -O3 -Wno-psabi" } */
+
+#include "riscv_vector.h"
+
+typedef float float32_t;
+
+vfloat64m2_t
+test_vfwnmacc_vv_f32m1_rm (vfloat64m2_t vd, vfloat32m1_t op1, vfloat32m1_t op2,
+size_t vl) {
+  return __riscv_vfwnmacc_vv_f64m2_rm (vd, op1, op2, 0, vl);
+}
+

RE: FW: [PATCH v1] RISC-V: Support RVV VFWNMACC rounding mode intrinsic API

Committed, thanks Juzhe.

Pan

From: juzhe.zh...@rivai.ai 
Sent: Monday, August 14, 2023 3:03 PM
To: Li, Pan2 
Cc: gcc-patches ; kito.cheng 
Subject: Re: FW: [PATCH v1] RISC-V: Support RVV VFWNMACC rounding mode 
intrinsic API

LGTM


juzhe.zh...@rivai.ai

From: Li, Pan2
Date: 2023-08-14 15:01
To: juzhe.zh...@rivai.ai
Subject: FW: [PATCH v1] RISC-V: Support RVV VFWNMACC rounding mode intrinsic API
Kindly ping.

Pan

-Original Message-
From: Li, Pan2 mailto:pan2...@intel.com>>
Sent: Monday, August 14, 2023 10:36 AM
To: gcc-patches@gcc.gnu.org
Cc: juzhe.zh...@rivai.ai; Li, Pan2 
mailto:pan2...@intel.com>>; Wang, Yanzhang 
mailto:yanzhang.w...@intel.com>>; 
kito.ch...@gmail.com
Subject: [PATCH v1] RISC-V: Support RVV VFWNMACC rounding mode intrinsic API

From: Pan Li mailto:pan2...@intel.com>>

This patch would like to support the rounding mode API for the
VFWNMACC as the below samples.

* __riscv_vfwnmacc_vv_f64m2_rm
* __riscv_vfwnmacc_vv_f64m2_rm_m
* __riscv_vfwnmacc_vf_f64m2_rm
* __riscv_vfwnmacc_vf_f64m2_rm_m

Signed-off-by: Pan Li mailto:pan2...@intel.com>>

gcc/ChangeLog:

* config/riscv/riscv-vector-builtins-bases.cc
(class vfwnmacc_frm): New class for frm.
(vfwnmacc_frm_obj): New declaration.
(BASE): Ditto.
* config/riscv/riscv-vector-builtins-bases.h: Ditto.
* config/riscv/riscv-vector-builtins-functions.def
(vfwnmacc_frm): New intrinsic function definition.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/base/float-point-wnmacc.c: New test.
---
.../riscv/riscv-vector-builtins-bases.cc  | 25 ++
.../riscv/riscv-vector-builtins-bases.h   |  1 +
.../riscv/riscv-vector-builtins-functions.def |  2 +
.../riscv/rvv/base/float-point-wnmacc.c   | 47 +++
4 files changed, 75 insertions(+)
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/base/float-point-wnmacc.c

diff --git a/gcc/config/riscv/riscv-vector-builtins-bases.cc 
b/gcc/config/riscv/riscv-vector-builtins-bases.cc
index e84d6d1d047..4a7f2b8e3e9 100644
--- a/gcc/config/riscv/riscv-vector-builtins-bases.cc
+++ b/gcc/config/riscv/riscv-vector-builtins-bases.cc
@@ -562,6 +562,29 @@ public:
   }
};
+/* Implements below instructions for frm
+   - vfwnmacc
+*/
+class vfwnmacc_frm : public function_base
+{
+public:
+  bool has_rounding_mode_operand_p () const override { return true; }
+
+  bool has_merge_operand_p () const override { return false; }
+
+  rtx expand (function_expander &e) const override
+  {
+if (e.op_info->op == OP_TYPE_vf)
+  return e.use_widen_ternop_insn (
+ code_for_pred_widen_mul_neg_scalar (MINUS, e.vector_mode ()));
+if (e.op_info->op == OP_TYPE_vv)
+  return e.use_widen_ternop_insn (
+ code_for_pred_widen_mul_neg (MINUS, e.vector_mode ()));
+
+gcc_unreachable ();
+  }
+};
+
/* Implements vrsub.  */
class vrsub : public function_base
{
@@ -2340,6 +2363,7 @@ static CONSTEXPR const vfmsub_frm vfmsub_frm_obj;
static CONSTEXPR const vfwmacc vfwmacc_obj;
static CONSTEXPR const vfwmacc_frm vfwmacc_frm_obj;
static CONSTEXPR const vfwnmacc vfwnmacc_obj;
+static CONSTEXPR const vfwnmacc_frm vfwnmacc_frm_obj;
static CONSTEXPR const vfwmsac vfwmsac_obj;
static CONSTEXPR const vfwnmsac vfwnmsac_obj;
static CONSTEXPR const unop vfsqrt_obj;
@@ -2584,6 +2608,7 @@ BASE (vfmsub_frm)
BASE (vfwmacc)
BASE (vfwmacc_frm)
BASE (vfwnmacc)
+BASE (vfwnmacc_frm)
BASE (vfwmsac)
BASE (vfwnmsac)
BASE (vfsqrt)
diff --git a/gcc/config/riscv/riscv-vector-builtins-bases.h 
b/gcc/config/riscv/riscv-vector-builtins-bases.h
index acbc7d42fbe..27c7deb4ec2 100644
--- a/gcc/config/riscv/riscv-vector-builtins-bases.h
+++ b/gcc/config/riscv/riscv-vector-builtins-bases.h
@@ -178,6 +178,7 @@ extern const function_base *const vfmsub_frm;
extern const function_base *const vfwmacc;
extern const function_base *const vfwmacc_frm;
extern const function_base *const vfwnmacc;
+extern const function_base *const vfwnmacc_frm;
extern const function_base *const vfwmsac;
extern const function_base *const vfwnmsac;
extern const function_base *const vfsqrt;
diff --git a/gcc/config/riscv/riscv-vector-builtins-functions.def 
b/gcc/config/riscv/riscv-vector-builtins-functions.def
index 0b73a5bcbc5..481c3b899f2 100644
--- a/gcc/config/riscv/riscv-vector-builtins-functions.def
+++ b/gcc/config/riscv/riscv-vector-builtins-functions.def
@@ -378,6 +378,8 @@ DEF_RVV_FUNCTION (vfwnmsac, alu, full_preds, f_wwfv_ops)
DEF_RVV_FUNCTION (vfwmacc_frm, alu_frm, full_preds, f_wwvv_ops)
DEF_RVV_FUNCTION (vfwmacc_frm, alu_frm, full_preds, f_wwfv_ops)
+DEF_RVV_FUNCTION (vfwnmacc_frm, alu_frm, full_preds, f_wwvv_ops)
+DEF_RVV_FUNCTION (vfwnmacc_frm, alu_frm, full_preds, f_wwfv_ops)
// 13.8. Vector Floating-Point Square-Root Instruction
DEF_RVV_FUNCTION (vfsqrt, alu, full_preds, f_v_ops)
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/base/float

RE: [PATCH v1] RISC-V: Support RVV VFWMSAC rounding mode intrinsic API

Committed, thanks Juzhe.

Pan

From: juzhe.zh...@rivai.ai 
Sent: Monday, August 14, 2023 2:42 PM
To: Li, Pan2 ; gcc-patches 
Cc: Li, Pan2 ; Wang, Yanzhang ; 
kito.cheng 
Subject: Re: [PATCH v1] RISC-V: Support RVV VFWMSAC rounding mode intrinsic API


LGTM

juzhe.zh...@rivai.ai

From: pan2.li
Date: 2023-08-14 11:29
To: gcc-patches
CC: juzhe.zhong; 
pan2.li; 
yanzhang.wang; 
kito.cheng
Subject: [PATCH v1] RISC-V: Support RVV VFWMSAC rounding mode intrinsic API
From: Pan Li mailto:pan2...@intel.com>>

This patch would like to support the rounding mode API for the
VFWMSAC as the below samples.

* __riscv_vfwmsac_vv_f64m2_rm
* __riscv_vfwmsac_vv_f64m2_rm_m
* __riscv_vfwmsac_vf_f64m2_rm
* __riscv_vfwmsac_vf_f64m2_rm_m

Signed-off-by: Pan Li mailto:pan2...@intel.com>>

gcc/ChangeLog:

* config/riscv/riscv-vector-builtins-bases.cc
(class vfwmsac_frm): New class for frm.
(vfwmsac_frm_obj): New declaration.
(BASE): Ditto.
* config/riscv/riscv-vector-builtins-bases.h: Ditto.
* config/riscv/riscv-vector-builtins-functions.def
(vfwmsac_frm): New intrinsic function definition.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/base/float-point-wmsac.c: New test.
---
.../riscv/riscv-vector-builtins-bases.cc  | 25 ++
.../riscv/riscv-vector-builtins-bases.h   |  1 +
.../riscv/riscv-vector-builtins-functions.def |  2 +
.../riscv/rvv/base/float-point-wmsac.c| 47 +++
4 files changed, 75 insertions(+)
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/base/float-point-wmsac.c

diff --git a/gcc/config/riscv/riscv-vector-builtins-bases.cc 
b/gcc/config/riscv/riscv-vector-builtins-bases.cc
index 4a7f2b8e3e9..5a5da903cb2 100644
--- a/gcc/config/riscv/riscv-vector-builtins-bases.cc
+++ b/gcc/config/riscv/riscv-vector-builtins-bases.cc
@@ -585,6 +585,29 @@ public:
   }
};
+/* Implements below instructions for frm
+   - vfwmsac
+*/
+class vfwmsac_frm : public function_base
+{
+public:
+  bool has_rounding_mode_operand_p () const override { return true; }
+
+  bool has_merge_operand_p () const override { return false; }
+
+  rtx expand (function_expander &e) const override
+  {
+if (e.op_info->op == OP_TYPE_vf)
+  return e.use_widen_ternop_insn (
+ code_for_pred_widen_mul_scalar (MINUS, e.vector_mode ()));
+if (e.op_info->op == OP_TYPE_vv)
+  return e.use_widen_ternop_insn (
+ code_for_pred_widen_mul (MINUS, e.vector_mode ()));
+
+gcc_unreachable ();
+  }
+};
+
/* Implements vrsub.  */
class vrsub : public function_base
{
@@ -2365,6 +2388,7 @@ static CONSTEXPR const vfwmacc_frm vfwmacc_frm_obj;
static CONSTEXPR const vfwnmacc vfwnmacc_obj;
static CONSTEXPR const vfwnmacc_frm vfwnmacc_frm_obj;
static CONSTEXPR const vfwmsac vfwmsac_obj;
+static CONSTEXPR const vfwmsac_frm vfwmsac_frm_obj;
static CONSTEXPR const vfwnmsac vfwnmsac_obj;
static CONSTEXPR const unop vfsqrt_obj;
static CONSTEXPR const float_misc vfrsqrt7_obj;
@@ -2610,6 +2634,7 @@ BASE (vfwmacc_frm)
BASE (vfwnmacc)
BASE (vfwnmacc_frm)
BASE (vfwmsac)
+BASE (vfwmsac_frm)
BASE (vfwnmsac)
BASE (vfsqrt)
BASE (vfrsqrt7)
diff --git a/gcc/config/riscv/riscv-vector-builtins-bases.h 
b/gcc/config/riscv/riscv-vector-builtins-bases.h
index 27c7deb4ec2..09356dd7ac8 100644
--- a/gcc/config/riscv/riscv-vector-builtins-bases.h
+++ b/gcc/config/riscv/riscv-vector-builtins-bases.h
@@ -180,6 +180,7 @@ extern const function_base *const vfwmacc_frm;
extern const function_base *const vfwnmacc;
extern const function_base *const vfwnmacc_frm;
extern const function_base *const vfwmsac;
+extern const function_base *const vfwmsac_frm;
extern const function_base *const vfwnmsac;
extern const function_base *const vfsqrt;
extern const function_base *const vfrsqrt7;
diff --git a/gcc/config/riscv/riscv-vector-builtins-functions.def 
b/gcc/config/riscv/riscv-vector-builtins-functions.def
index 481c3b899f2..e2a79607d04 100644
--- a/gcc/config/riscv/riscv-vector-builtins-functions.def
+++ b/gcc/config/riscv/riscv-vector-builtins-functions.def
@@ -380,6 +380,8 @@ DEF_RVV_FUNCTION (vfwmacc_frm, alu_frm, full_preds, 
f_wwvv_ops)
DEF_RVV_FUNCTION (vfwmacc_frm, alu_frm, full_preds, f_wwfv_ops)
DEF_RVV_FUNCTION (vfwnmacc_frm, alu_frm, full_preds, f_wwvv_ops)
DEF_RVV_FUNCTION (vfwnmacc_frm, alu_frm, full_preds, f_wwfv_ops)
+DEF_RVV_FUNCTION (vfwmsac_frm, alu_frm, full_preds, f_wwvv_ops)
+DEF_RVV_FUNCTION (vfwmsac_frm, alu_frm, full_preds, f_wwfv_ops)
// 13.8. Vector Floating-Point Square-Root Instruction
DEF_RVV_FUNCTION (vfsqrt, alu, full_preds, f_v_ops)
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/base/float-point-wmsac.c 
b/gcc/testsuite/gcc.target/riscv/rvv/base/float-point-wmsac.c
new file mode 100644
index 000..886a0b13695
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/base/float-point-wmsac.c
@@ -0

Re: Re: [PATCH V4] VECT: Support loop len control on EXTRACT_LAST vectorization

Thanks Richi.

CC kewen to see whether this patch is suitable for powerpc and s390.



juzhe.zh...@rivai.ai
 
From: Richard Biener
Date: 2023-08-14 14:53
To: Ju-Zhe Zhong
CC: gcc-patches; richard.sandiford
Subject: Re: [PATCH V4] VECT: Support loop len control on EXTRACT_LAST 
vectorization
On Fri, 11 Aug 2023, juzhe.zh...@rivai.ai wrote:
 
> From: Ju-Zhe Zhong 
> 
> Hi, Richard and Richi.
> 
> This patch adds support for live vectorization by VEC_EXTRACT for LEN loop control.
 
OK.
 
Thanks,
Richard.
 
> Consider this following case:
> 
> #include <stdint.h>
> 
> #define EXTRACT_LAST(TYPE) \
>   TYPE __attribute__ ((noinline, noclone)) \
>   test_##TYPE (TYPE *x, int n, TYPE value) \
>   { \
> TYPE last; \
> for (int j = 0; j < n; ++j) \
>   { \
> last = x[j]; \
> x[j] = last * value; \
>   } \
> return last; \
>   }
> 
> #define TEST_ALL(T) \
>   T (uint8_t) \
> 
> TEST_ALL (EXTRACT_LAST)
> 
> ARM SVE IR:
> 
> Preheader:
>   max_mask_34 = .WHILE_ULT (0, bnd.5_6, { 0, ... });
> 
> Loop:
>   ...
>   # loop_mask_22 = PHI 
>   ...
>   vect_last_12.8_23 = .MASK_LOAD (_7, 8B, loop_mask_22);
>   vect__4.9_27 = vect_last_12.8_23 * vect_cst__26;
>   .MASK_STORE (_7, 8B, loop_mask_22, vect__4.9_27);
>   ...
>   next_mask_35 = .WHILE_ULT (_1, bnd.5_6, { 0, ... });
>   ...
> 
> Epilogue:
>   _25 = .EXTRACT_LAST (loop_mask_22, vect_last_12.8_23);
> 
> For RVV since we prefer len in loop control, after this patch for RVV:
> 
> Loop:
>   ...
>   loop_len_22 = SELECT_VL;
>   vect_last_12.8_23 = .MASK_LOAD (_7, 8B, loop_len_22);
>   vect__4.9_27 = vect_last_12.8_23 * vect_cst__26;
>   .MASK_STORE (_7, 8B, loop_len_22, vect__4.9_27);
>   ...
> 
> Epilogue:
>   _25 = .VEC_EXTRACT (loop_len_22 + bias - 1, vect_last_12.8_23);
> 
> Details of this approach:
> 
> 1. Step 1 - Add 'vect_can_vectorize_extract_last_with_len_p'  to enable live 
> vectorization
> for LEN loop control.
>
>    This function checks whether the target supports:
> - Use LEN as the loop control.
> - Support VEC_EXTRACT optab.
> 
> 2. Step 2 - Record LEN for loop control if 
> 'vect_can_vectorize_extract_last_with_len_p' is true.
> 
> 3. Step 3 - Generate VEC_EXTRACT (v, LEN + BIAS - 1).
> 
> The only difference between mask and len is that len uses the length
> generated by SELECT_VL and
> uses the VEC_EXTRACT pattern. The rest of the live vectorization is exactly the
> same as for ARM SVE.
> 
> gcc/ChangeLog:
> 
> * tree-vect-loop.cc (vectorizable_live_operation): Add loop len control.
> 
> ---
>  gcc/tree-vect-loop.cc | 78 ++-
>  1 file changed, 62 insertions(+), 16 deletions(-)
> 
> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> index bf8d677b584..a011e2dacb2 100644
> --- a/gcc/tree-vect-loop.cc
> +++ b/gcc/tree-vect-loop.cc
> @@ -10278,17 +10278,7 @@ vectorizable_live_operation (vec_info *vinfo, 
> stmt_vec_info stmt_info,
>/* No transformation required.  */
>if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
>  {
> -   if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
> -OPTIMIZE_FOR_SPEED))
> - {
> -   if (dump_enabled_p ())
> - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> - "can't operate on partial vectors "
> - "because the target doesn't support extract "
> - "last reduction.\n");
> -   LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
> - }
> -   else if (slp_node)
> +   if (slp_node)
>  {
>if (dump_enabled_p ())
>  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> @@ -10308,9 +10298,28 @@ vectorizable_live_operation (vec_info *vinfo, 
> stmt_vec_info stmt_info,
>else
>  {
>gcc_assert (ncopies == 1 && !slp_node);
> -   vect_record_loop_mask (loop_vinfo,
> -  &LOOP_VINFO_MASKS (loop_vinfo),
> -  1, vectype, NULL);
> +   if (direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
> +   OPTIMIZE_FOR_SPEED))
> + vect_record_loop_mask (loop_vinfo,
> +&LOOP_VINFO_MASKS (loop_vinfo),
> +1, vectype, NULL);
> +   else if (convert_optab_handler (vec_extract_optab,
> +   TYPE_MODE (vectype),
> +   TYPE_MODE (TREE_TYPE (vectype)))
> +!= CODE_FOR_nothing)
> + vect_record_loop_len (loop_vinfo,
> +   &LOOP_VINFO_LENS (loop_vinfo),
> +   1, vectype, 1);
> +   else
> + {
> +   if (dump_enabled_p ())
> + dump_printf_loc (
> +   MSG_MISSED_OPTIMIZATION, vect_location,
> +   "can't operate on partial vectors "
> +   "because the target doesn't support extract "
> +   "last reduction.\n");
> +   LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
> + }
>  }
>  }
>/* ???  Enable for loop costing as well.  */
> @@ -10336,7 +10345,9 @@ vectorizable_live_operation (vec_info *vinfo, 
> stmt_vec_info stmt_info,
>gimple *vec_stmt;
>if (slp_node)
>  {
> -  gcc_assert (!loop_vinfo || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
> +  gcc_assert (!loop_vinf

RE: [PATCH v1] RISC-V: Support RVV VFWNMSAC rounding mode intrinsic API

Committed, thanks Juzhe.

Pan

From: juzhe.zh...@rivai.ai 
Sent: Monday, August 14, 2023 2:43 PM
To: Li, Pan2 ; gcc-patches 
Cc: Li, Pan2 ; Wang, Yanzhang ; 
kito.cheng 
Subject: Re: [PATCH v1] RISC-V: Support RVV VFWNMSAC rounding mode intrinsic API

LGTM


juzhe.zh...@rivai.ai

From: pan2.li
Date: 2023-08-14 14:07
To: gcc-patches
CC: juzhe.zhong; 
pan2.li; 
yanzhang.wang; 
kito.cheng
Subject: [PATCH v1] RISC-V: Support RVV VFWNMSAC rounding mode intrinsic API
From: Pan Li mailto:pan2...@intel.com>>

This patch would like to support the rounding mode API for the
VFWNMSAC as the below samples.

* __riscv_vfwnmsac_vv_f64m2_rm
* __riscv_vfwnmsac_vv_f64m2_rm_m
* __riscv_vfwnmsac_vf_f64m2_rm
* __riscv_vfwnmsac_vf_f64m2_rm_m

Signed-off-by: Pan Li mailto:pan2...@intel.com>>

gcc/ChangeLog:

* config/riscv/riscv-vector-builtins-bases.cc
(class vfwnmsac_frm): New class for frm.
(vfwnmsac_frm_obj): New declaration.
(BASE): Ditto.
* config/riscv/riscv-vector-builtins-bases.h: Ditto.
* config/riscv/riscv-vector-builtins-functions.def
(vfwnmsac_frm): New intrinsic function definition.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/base/float-point-wnmsac.c: New test.
---
.../riscv/riscv-vector-builtins-bases.cc  | 25 ++
.../riscv/riscv-vector-builtins-bases.h   |  1 +
.../riscv/riscv-vector-builtins-functions.def |  2 +
.../riscv/rvv/base/float-point-wnmsac.c   | 47 +++
4 files changed, 75 insertions(+)
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/base/float-point-wnmsac.c

diff --git a/gcc/config/riscv/riscv-vector-builtins-bases.cc 
b/gcc/config/riscv/riscv-vector-builtins-bases.cc
index 5a5da903cb2..b458560a040 100644
--- a/gcc/config/riscv/riscv-vector-builtins-bases.cc
+++ b/gcc/config/riscv/riscv-vector-builtins-bases.cc
@@ -608,6 +608,29 @@ public:
   }
};
+/* Implements below instructions for frm
+   - vfwnmsac
+*/
+class vfwnmsac_frm : public function_base
+{
+public:
+  bool has_rounding_mode_operand_p () const override { return true; }
+
+  bool has_merge_operand_p () const override { return false; }
+
+  rtx expand (function_expander &e) const override
+  {
+if (e.op_info->op == OP_TYPE_vf)
+  return e.use_widen_ternop_insn (
+ code_for_pred_widen_mul_neg_scalar (PLUS, e.vector_mode ()));
+if (e.op_info->op == OP_TYPE_vv)
+  return e.use_widen_ternop_insn (
+ code_for_pred_widen_mul_neg (PLUS, e.vector_mode ()));
+
+gcc_unreachable ();
+  }
+};
+
/* Implements vrsub.  */
class vrsub : public function_base
{
@@ -2390,6 +2413,7 @@ static CONSTEXPR const vfwnmacc_frm vfwnmacc_frm_obj;
static CONSTEXPR const vfwmsac vfwmsac_obj;
static CONSTEXPR const vfwmsac_frm vfwmsac_frm_obj;
static CONSTEXPR const vfwnmsac vfwnmsac_obj;
+static CONSTEXPR const vfwnmsac_frm vfwnmsac_frm_obj;
static CONSTEXPR const unop vfsqrt_obj;
static CONSTEXPR const float_misc vfrsqrt7_obj;
static CONSTEXPR const float_misc vfrec7_obj;
@@ -2636,6 +2660,7 @@ BASE (vfwnmacc_frm)
BASE (vfwmsac)
BASE (vfwmsac_frm)
BASE (vfwnmsac)
+BASE (vfwnmsac_frm)
BASE (vfsqrt)
BASE (vfrsqrt7)
BASE (vfrec7)
diff --git a/gcc/config/riscv/riscv-vector-builtins-bases.h 
b/gcc/config/riscv/riscv-vector-builtins-bases.h
index 09356dd7ac8..85e8b9a3769 100644
--- a/gcc/config/riscv/riscv-vector-builtins-bases.h
+++ b/gcc/config/riscv/riscv-vector-builtins-bases.h
@@ -182,6 +182,7 @@ extern const function_base *const vfwnmacc_frm;
extern const function_base *const vfwmsac;
extern const function_base *const vfwmsac_frm;
extern const function_base *const vfwnmsac;
+extern const function_base *const vfwnmsac_frm;
extern const function_base *const vfsqrt;
extern const function_base *const vfrsqrt7;
extern const function_base *const vfrec7;
diff --git a/gcc/config/riscv/riscv-vector-builtins-functions.def 
b/gcc/config/riscv/riscv-vector-builtins-functions.def
index e2a79607d04..7e2a4ab2969 100644
--- a/gcc/config/riscv/riscv-vector-builtins-functions.def
+++ b/gcc/config/riscv/riscv-vector-builtins-functions.def
@@ -382,6 +382,8 @@ DEF_RVV_FUNCTION (vfwnmacc_frm, alu_frm, full_preds, 
f_wwvv_ops)
DEF_RVV_FUNCTION (vfwnmacc_frm, alu_frm, full_preds, f_wwfv_ops)
DEF_RVV_FUNCTION (vfwmsac_frm, alu_frm, full_preds, f_wwvv_ops)
DEF_RVV_FUNCTION (vfwmsac_frm, alu_frm, full_preds, f_wwfv_ops)
+DEF_RVV_FUNCTION (vfwnmsac_frm, alu_frm, full_preds, f_wwvv_ops)
+DEF_RVV_FUNCTION (vfwnmsac_frm, alu_frm, full_preds, f_wwfv_ops)
// 13.8. Vector Floating-Point Square-Root Instruction
DEF_RVV_FUNCTION (vfsqrt, alu, full_preds, f_v_ops)
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/base/float-point-wnmsac.c 
b/gcc/testsuite/gcc.target/riscv/rvv/base/float-point-wnmsac.c
new file mode 100644
index 000..13eb306313c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/base/f

Re: [PATCH v1 1/6] LoongArch: a symmetric multilib subdir layout

2023-08-14 Thread Yujie Yang

On Mon, Aug 14, 2023 at 01:38:40PM +0800, Xi Ruoyao wrote:
> On Mon, 2023-08-14 at 11:57 +0800, Yang Yujie wrote:
> 
> > However, for LoongArch, we do not want such a "toplevel" library
> > installation since the default ABI may change.  We expect all
> > multilib variants of libraries to be installed to their designated
> > ABI-specific subdirs (e.g. base/lp64d) of the GCC libdir, so that
> > the default ABI can be configured arbitrarily (with --with-abi)
> > while the gcc libdir layout stays consistent.  This could be
> > helpful for the distribution packaging of GCC libraries.
> 
> Have you tested a --disable-multilib configuration?  To me, with
> --disable-multilib everything should still be in the toplevel
> directory, not in any sub-directory.

That's a good point, sorry I missed --disable-multilib here.

However, you don't really need --disable-multilib since
the libraries are only built once in the default ABI configuration
as long as --with-multilib-list does not request anything more than
that.

Maybe we should force-enable multilib in all cases.

[PATCH v1] RISC-V: Support RVV VFSQRT rounding mode intrinsic API

2023-08-14 Thread Pan Li via Gcc-patches

From: Pan Li 

This patch would like to support the rounding mode API for the
VFSQRT as the below samples.

* __riscv_vfsqrt_v_f32m1_rm
* __riscv_vfsqrt_v_f32m1_rm_m

Signed-off-by: Pan Li 

gcc/ChangeLog:

* config/riscv/riscv-vector-builtins-bases.cc
(class unop_frm): New class for frm.
(vfsqrt_frm_obj): New declaration.
(BASE): Ditto.
* config/riscv/riscv-vector-builtins-bases.h: Ditto.
* config/riscv/riscv-vector-builtins-functions.def
(vfsqrt_frm): New intrinsic function definition.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/base/float-point-sqrt.c: New test.
---
 .../riscv/riscv-vector-builtins-bases.cc  | 17 ++
 .../riscv/riscv-vector-builtins-bases.h   |  1 +
 .../riscv/riscv-vector-builtins-functions.def |  2 ++
 .../riscv/rvv/base/float-point-sqrt.c | 31 +++
 4 files changed, 51 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/base/float-point-sqrt.c

diff --git a/gcc/config/riscv/riscv-vector-builtins-bases.cc 
b/gcc/config/riscv/riscv-vector-builtins-bases.cc
index b458560a040..2074dac0f16 100644
--- a/gcc/config/riscv/riscv-vector-builtins-bases.cc
+++ b/gcc/config/riscv/riscv-vector-builtins-bases.cc
@@ -631,6 +631,21 @@ public:
   }
 };
 
+/* Implements below instructions for frm
+   - vfsqrt
+*/
+template<rtx_code CODE>
+class unop_frm : public function_base
+{
+public:
+  bool has_rounding_mode_operand_p () const override { return true; }
+
+  rtx expand (function_expander &e) const override
+  {
+return e.use_exact_insn (code_for_pred (CODE, e.vector_mode ()));
+  }
+};
+
 /* Implements vrsub.  */
 class vrsub : public function_base
 {
@@ -2415,6 +2430,7 @@ static CONSTEXPR const vfwmsac_frm vfwmsac_frm_obj;
 static CONSTEXPR const vfwnmsac vfwnmsac_obj;
 static CONSTEXPR const vfwnmsac_frm vfwnmsac_frm_obj;
 static CONSTEXPR const unop vfsqrt_obj;
+static CONSTEXPR const unop_frm vfsqrt_frm_obj;
 static CONSTEXPR const float_misc vfrsqrt7_obj;
 static CONSTEXPR const float_misc vfrec7_obj;
 static CONSTEXPR const binop vfmin_obj;
@@ -2662,6 +2678,7 @@ BASE (vfwmsac_frm)
 BASE (vfwnmsac)
 BASE (vfwnmsac_frm)
 BASE (vfsqrt)
+BASE (vfsqrt_frm)
 BASE (vfrsqrt7)
 BASE (vfrec7)
 BASE (vfmin)
diff --git a/gcc/config/riscv/riscv-vector-builtins-bases.h 
b/gcc/config/riscv/riscv-vector-builtins-bases.h
index 85e8b9a3769..5c91381bd4c 100644
--- a/gcc/config/riscv/riscv-vector-builtins-bases.h
+++ b/gcc/config/riscv/riscv-vector-builtins-bases.h
@@ -184,6 +184,7 @@ extern const function_base *const vfwmsac_frm;
 extern const function_base *const vfwnmsac;
 extern const function_base *const vfwnmsac_frm;
 extern const function_base *const vfsqrt;
+extern const function_base *const vfsqrt_frm;
 extern const function_base *const vfrsqrt7;
 extern const function_base *const vfrec7;
 extern const function_base *const vfmin;
diff --git a/gcc/config/riscv/riscv-vector-builtins-functions.def 
b/gcc/config/riscv/riscv-vector-builtins-functions.def
index 7e2a4ab2969..a821aca6a4b 100644
--- a/gcc/config/riscv/riscv-vector-builtins-functions.def
+++ b/gcc/config/riscv/riscv-vector-builtins-functions.def
@@ -388,6 +388,8 @@ DEF_RVV_FUNCTION (vfwnmsac_frm, alu_frm, full_preds, 
f_wwfv_ops)
 // 13.8. Vector Floating-Point Square-Root Instruction
 DEF_RVV_FUNCTION (vfsqrt, alu, full_preds, f_v_ops)
 
+DEF_RVV_FUNCTION (vfsqrt_frm, alu_frm, full_preds, f_v_ops)
+
 // 13.9. Vector Floating-Point Reciprocal Square-Root Estimate Instruction
 DEF_RVV_FUNCTION (vfrsqrt7, alu, full_preds, f_v_ops)
 
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/base/float-point-sqrt.c 
b/gcc/testsuite/gcc.target/riscv/rvv/base/float-point-sqrt.c
new file mode 100644
index 000..afd1fb2b8f6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/base/float-point-sqrt.c
@@ -0,0 +1,31 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv -mabi=lp64 -O3 -Wno-psabi" } */
+
+#include "riscv_vector.h"
+
+typedef float float32_t;
+
+vfloat32m1_t
+test_riscv_vfsqrt_vv_f32m1_rm (vfloat32m1_t op1, size_t vl) {
+  return __riscv_vfsqrt_v_f32m1_rm (op1, 0, vl);
+}
+
+vfloat32m1_t
+test_vfsqrt_vv_f32m1_rm_m (vbool32_t mask, vfloat32m1_t op1, size_t vl) {
+  return __riscv_vfsqrt_v_f32m1_rm_m (mask, op1, 1, vl);
+}
+
+vfloat32m1_t
+test_riscv_vfsqrt_vv_f32m1 (vfloat32m1_t op1, size_t vl) {
+  return __riscv_vfsqrt_v_f32m1 (op1, vl);
+}
+
+vfloat32m1_t
+test_vfsqrt_vv_f32m1_m (vbool32_t mask, vfloat32m1_t op1, size_t vl) {
+  return __riscv_vfsqrt_v_f32m1_m (mask, op1, vl);
+}
+
+/* { dg-final { scan-assembler-times {vfsqrt\.v\s+v[0-9]+,\s*v[0-9]+} 4 } } */
+/* { dg-final { scan-assembler-times {frrm\s+[axs][0-9]+} 2 } } */
+/* { dg-final { scan-assembler-times {fsrm\s+[axs][0-9]+} 2 } } */
+/* { dg-final { scan-assembler-times {fsrmi\s+[01234]} 2 } } */
-- 
2.34.1

Re: [PATCH v1] RISC-V: Support RVV VFSQRT rounding mode intrinsic API

LGTM



juzhe.zh...@rivai.ai
 
From: pan2.li
Date: 2023-08-14 15:39
To: gcc-patches
CC: juzhe.zhong; pan2.li; yanzhang.wang; kito.cheng
Subject: [PATCH v1] RISC-V: Support RVV VFSQRT rounding mode intrinsic API
From: Pan Li 
 
This patch would like to support the rounding mode API for the
VFSQRT as the below samples.
 
* __riscv_vfsqrt_v_f32m1_rm
* __riscv_vfsqrt_v_f32m1_rm_m
 
Signed-off-by: Pan Li 
 
gcc/ChangeLog:
 
* config/riscv/riscv-vector-builtins-bases.cc
(class unop_frm): New class for frm.
(vfsqrt_frm_obj): New declaration.
(BASE): Ditto.
* config/riscv/riscv-vector-builtins-bases.h: Ditto.
* config/riscv/riscv-vector-builtins-functions.def
(vfsqrt_frm): New intrinsic function definition.
 
gcc/testsuite/ChangeLog:
 
* gcc.target/riscv/rvv/base/float-point-sqrt.c: New test.
---
.../riscv/riscv-vector-builtins-bases.cc  | 17 ++
.../riscv/riscv-vector-builtins-bases.h   |  1 +
.../riscv/riscv-vector-builtins-functions.def |  2 ++
.../riscv/rvv/base/float-point-sqrt.c | 31 +++
4 files changed, 51 insertions(+)
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/base/float-point-sqrt.c
 
diff --git a/gcc/config/riscv/riscv-vector-builtins-bases.cc 
b/gcc/config/riscv/riscv-vector-builtins-bases.cc
index b458560a040..2074dac0f16 100644
--- a/gcc/config/riscv/riscv-vector-builtins-bases.cc
+++ b/gcc/config/riscv/riscv-vector-builtins-bases.cc
@@ -631,6 +631,21 @@ public:
   }
};
+/* Implements below instructions for frm
+   - vfsqrt
+*/
+template<rtx_code CODE>
+class unop_frm : public function_base
+{
+public:
+  bool has_rounding_mode_operand_p () const override { return true; }
+
+  rtx expand (function_expander &e) const override
+  {
+return e.use_exact_insn (code_for_pred (CODE, e.vector_mode ()));
+  }
+};
+
/* Implements vrsub.  */
class vrsub : public function_base
{
@@ -2415,6 +2430,7 @@ static CONSTEXPR const vfwmsac_frm vfwmsac_frm_obj;
static CONSTEXPR const vfwnmsac vfwnmsac_obj;
static CONSTEXPR const vfwnmsac_frm vfwnmsac_frm_obj;
static CONSTEXPR const unop vfsqrt_obj;
+static CONSTEXPR const unop_frm<SQRT> vfsqrt_frm_obj;
static CONSTEXPR const float_misc vfrsqrt7_obj;
static CONSTEXPR const float_misc vfrec7_obj;
static CONSTEXPR const binop vfmin_obj;
@@ -2662,6 +2678,7 @@ BASE (vfwmsac_frm)
BASE (vfwnmsac)
BASE (vfwnmsac_frm)
BASE (vfsqrt)
+BASE (vfsqrt_frm)
BASE (vfrsqrt7)
BASE (vfrec7)
BASE (vfmin)
diff --git a/gcc/config/riscv/riscv-vector-builtins-bases.h 
b/gcc/config/riscv/riscv-vector-builtins-bases.h
index 85e8b9a3769..5c91381bd4c 100644
--- a/gcc/config/riscv/riscv-vector-builtins-bases.h
+++ b/gcc/config/riscv/riscv-vector-builtins-bases.h
@@ -184,6 +184,7 @@ extern const function_base *const vfwmsac_frm;
extern const function_base *const vfwnmsac;
extern const function_base *const vfwnmsac_frm;
extern const function_base *const vfsqrt;
+extern const function_base *const vfsqrt_frm;
extern const function_base *const vfrsqrt7;
extern const function_base *const vfrec7;
extern const function_base *const vfmin;
diff --git a/gcc/config/riscv/riscv-vector-builtins-functions.def 
b/gcc/config/riscv/riscv-vector-builtins-functions.def
index 7e2a4ab2969..a821aca6a4b 100644
--- a/gcc/config/riscv/riscv-vector-builtins-functions.def
+++ b/gcc/config/riscv/riscv-vector-builtins-functions.def
@@ -388,6 +388,8 @@ DEF_RVV_FUNCTION (vfwnmsac_frm, alu_frm, full_preds, 
f_wwfv_ops)
// 13.8. Vector Floating-Point Square-Root Instruction
DEF_RVV_FUNCTION (vfsqrt, alu, full_preds, f_v_ops)
+DEF_RVV_FUNCTION (vfsqrt_frm, alu_frm, full_preds, f_v_ops)
+
// 13.9. Vector Floating-Point Reciprocal Square-Root Estimate Instruction
DEF_RVV_FUNCTION (vfrsqrt7, alu, full_preds, f_v_ops)
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/base/float-point-sqrt.c 
b/gcc/testsuite/gcc.target/riscv/rvv/base/float-point-sqrt.c
new file mode 100644
index 000..afd1fb2b8f6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/base/float-point-sqrt.c
@@ -0,0 +1,31 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv -mabi=lp64 -O3 -Wno-psabi" } */
+
+#include "riscv_vector.h"
+
+typedef float float32_t;
+
+vfloat32m1_t
+test_riscv_vfsqrt_vv_f32m1_rm (vfloat32m1_t op1, size_t vl) {
+  return __riscv_vfsqrt_v_f32m1_rm (op1, 0, vl);
+}
+
+vfloat32m1_t
+test_vfsqrt_vv_f32m1_rm_m (vbool32_t mask, vfloat32m1_t op1, size_t vl) {
+  return __riscv_vfsqrt_v_f32m1_rm_m (mask, op1, 1, vl);
+}
+
+vfloat32m1_t
+test_riscv_vfsqrt_vv_f32m1 (vfloat32m1_t op1, size_t vl) {
+  return __riscv_vfsqrt_v_f32m1 (op1, vl);
+}
+
+vfloat32m1_t
+test_vfsqrt_vv_f32m1_m (vbool32_t mask, vfloat32m1_t op1, size_t vl) {
+  return __riscv_vfsqrt_v_f32m1_m (mask, op1, vl);
+}
+
+/* { dg-final { scan-assembler-times {vfsqrt\.v\s+v[0-9]+,\s*v[0-9]+} 4 } } */
+/* { dg-final { scan-assembler-times {frrm\s+[axs][0-9]+} 2 } } */
+/* { dg-final { scan-assembler-times {fsrm\s+[axs][0-9]+} 2 } } */
+/* { dg-final { scan-assembler-times {fsrmi\s+[01234]} 2 } } */
-- 
2.34.1

Re: [PATCHv4, rs6000] Generate mfvsrwz for all subtargets and remove redundant zero extend [PR106769]

Hi Haochen,

on 2023/8/14 10:18, HAO CHEN GUI wrote:
> Hi,
>   This patch modifies vsx extract expand and generates mfvsrwz/stxsiwx
> for all sub targets when the mode is V4SI and the extracted element is word
> 1 from BE order. Also this patch adds an insn pattern for mfvsrwz which
> helps eliminate redundant zero extend.
> 
>   Compared to last version, the main change is to put the word index
> checking in the split condition of "*vsx_extract_v4si_w023". Also modified
> some comments.
> https://gcc.gnu.org/pipermail/gcc-patches/2023-July/625380.html
> 
>   Bootstrapped and tested on powerpc64-linux BE and LE with no regressions.
> 
> Thanks
> Gui Haochen
> 
> ChangeLog
> rs6000: Generate mfvsrwz for all platform and remove redundant zero extend
> 
> mfvsrwz has lower latency than xxextractuw or vextuw[lr]x.  So it should be
> generated even with p9 vector enabled.  Also the instruction is already
> zero extended.  A combine pattern is needed to eliminate redundant zero
> extend instructions.
> 
> gcc/
>   PR target/106769
>   * config/rs6000/vsx.md (expand vsx_extract_): Set it only
>   for V8HI and V16QI.
>   (vsx_extract_v4si): New expand for V4SI extraction.
>   (vsx_extract_v4si_w1): New insn pattern for V4SI extraction on
>   word 1 from BE order.   
>   (*mfvsrwz): New insn pattern for mfvsrwz.
>   (*vsx_extract__di_p9): Assert that it won't be generated on
>   word 1 from BE order.
>   (*vsx_extract_si): Remove.
>   (*vsx_extract_v4si_w023): New insn and split pattern on word 0, 2,
>   3 from BE order.
> 
> gcc/testsuite/
>   PR target/106769
>   * gcc.target/powerpc/pr106769.h: New.
>   * gcc.target/powerpc/pr106769-p8.c: New.
>   * gcc.target/powerpc/pr106769-p9.c: New.
> 
> patch.diff
> diff --git a/gcc/config/rs6000/vsx.md b/gcc/config/rs6000/vsx.md
> index 0a34ceebeb5..1cbdc2f1c01 100644
> --- a/gcc/config/rs6000/vsx.md
> +++ b/gcc/config/rs6000/vsx.md
> @@ -3722,9 +3722,9 @@ (define_insn "vsx_xxpermdi2__1"
>  (define_expand  "vsx_extract_"
>[(parallel [(set (match_operand: 0 "gpc_reg_operand")
>  (vec_select:
> - (match_operand:VSX_EXTRACT_I 1 "gpc_reg_operand")
> + (match_operand:VSX_EXTRACT_I2 1 "gpc_reg_operand")
>   (parallel [(match_operand:QI 2 "const_int_operand")])))
> -   (clobber (match_scratch:VSX_EXTRACT_I 3))])]
> +   (clobber (match_scratch:VSX_EXTRACT_I2 3))])]
>"VECTOR_MEM_VSX_P (mode) && TARGET_DIRECT_MOVE_64BIT"
>  {
>/* If we have ISA 3.0, we can do a xxextractuw/vextractu{b,h}.  */
> @@ -3736,6 +3736,63 @@ (define_expand  "vsx_extract_"
>  }
>  })
> 
> +(define_expand  "vsx_extract_v4si"
> +  [(parallel [(set (match_operand:SI 0 "gpc_reg_operand")
> +(vec_select:SI
> + (match_operand:V4SI 1 "gpc_reg_operand")
> + (parallel [(match_operand:QI 2 "const_0_to_3_operand")])))
> +   (clobber (match_scratch:V4SI 3))])]
> +  "TARGET_DIRECT_MOVE_64BIT"
> +{
> +  /* The word 1 (BE order) can be extracted by mfvsrwz/stxsiwx.  So just
> + fall through to vsx_extract_v4si_w1.  */
> +  if (TARGET_P9_VECTOR
> +  && INTVAL (operands[2]) != (BYTES_BIG_ENDIAN ? 1 : 2))
> +{
> +  emit_insn (gen_vsx_extract_v4si_p9 (operands[0], operands[1],
> +   operands[2]));
> +  DONE;
> +}
> +})
> +
> +/* Extract from word 1 (BE order);  */

Nit: I guess I requested this before; please use ";" instead of
"/* ... */" for the comments, to align with the existing ones.

> +(define_insn "vsx_extract_v4si_w1"
> +  [(set (match_operand:SI 0 "nonimmediate_operand" "=r,wa,Z,wa")
> + (vec_select:SI
> +  (match_operand:V4SI 1 "gpc_reg_operand" "v,v,v,0")
> +  (parallel [(match_operand:QI 2 "const_0_to_3_operand" "n,n,n,n")])))
> +   (clobber (match_scratch:V4SI 3 "=v,v,v,v"))]
> +  "TARGET_DIRECT_MOVE_64BIT
> +   && INTVAL (operands[2]) == (BYTES_BIG_ENDIAN ? 1 : 2)"
> +{
> +   if (which_alternative == 0)
> + return "mfvsrwz %0,%x1";
> +
> +   if (which_alternative == 1)
> + return "xxlor %x0,%x1,%x1";
> +
> +   if (which_alternative == 2)
> + return "stxsiwx %x1,%y0";
> +
> +   return ASM_COMMENT_START " vec_extract to same register";
> +}
> +  [(set_attr "type" "mfvsr,veclogical,fpstore,*")
> +   (set_attr "length" "4,4,4,0")
> +   (set_attr "isa" "p8v,*,p8v,*")])
> +
> +(define_insn "*mfvsrwz"
> +  [(set (match_operand:DI 0 "register_operand" "=r")
> + (zero_extend:DI
> +   (vec_select:SI
> + (match_operand:V4SI 1 "vsx_register_operand" "wa")
> + (parallel [(match_operand:QI 2 "const_int_operand" "n")]
> +   (clobber (match_scratch:V4SI 3 "=v"))]
> +  "TARGET_DIRECT_MOVE_64BIT
> +   && INTVAL (operands[2]) == (BYTES_BIG_ENDIAN ? 1 : 2)"
> +  "mfvsrwz %0,%x1"
> +  [(set_attr "type" "mfvsr")
> +   (set_attr "isa" "p8v")])
> +
>  (define_insn "vsx_extract__p9"
>[(set (match_operand: 0

Re: [PATCH v1 1/6] LoongArch: a symmetric multilib subdir layout

On Mon, 2023-08-14 at 15:37 +0800, Yujie Yang wrote:
> On Mon, Aug 14, 2023 at 01:38:40PM +0800, Xi Ruoyao wrote:
> > On Mon, 2023-08-14 at 11:57 +0800, Yang Yujie wrote:
> > 
> > > However, for LoongArch, we do not want such a "toplevel" library
> > > installation since the default ABI may change.  We expect all
> > > multilib variants of libraries to be installed to their designated
> > > ABI-specific subdirs (e.g. base/lp64d) of the GCC libdir, so that
> > > the default ABI can be configured arbitrarily (with --with-abi)
> > > while the gcc libdir layout stays consistent.  This could be
> > > helpful for the distribution packaging of GCC libraries.
> > 
> > Have you tested a --disable-multilib configuration?  To me, with
> > --disable-multilib everything should still be in the toplevel
> > directory, not any sub-directory.
> 
> That's a good point, sorry I missed --disable-multilib here.
> 
> However, you don't really need --disable-multilib since
> the libraries are only built once in the default ABI configuration
> as long as --with-multilib-list does not request anything more than
> that.
> 
> Maybe we should force-enable multilib in all cases.

I really don't like this.  Why must I always remind myself "hey, this
is LoongArch, there is a different directory layout" when I don't need
multilib at all?

-- 
Xi Ruoyao 
School of Aerospace Science and Technology, Xidian University

Re: [PATCH 2/3] ivopts: Call valid_mem_ref_p with code_helper [PR110248]

2023-08-14 Thread Jan-Benedict Glaw

On Fri, 2023-06-30 13:46:40 +0800, Kewen.Lin via Gcc-patches 
 wrote:
> Bootstrapped and regtested on x86_64-redhat-linux and
> powerpc64{,le}-linux-gnu.
> 
> Is it ok for trunk?
[...]

> diff --git a/gcc/recog.h b/gcc/recog.h
> index badf8e3dc1c..c6ef619c5dd 100644
> --- a/gcc/recog.h
> +++ b/gcc/recog.h
> @@ -20,6 +20,9 @@ along with GCC; see the file COPYING3.  If not see
>  #ifndef GCC_RECOG_H
>  #define GCC_RECOG_H
> 
> +/* For enum tree_code ERROR_MARK.  */
> +#include "tree.h"
> +
>  /* Random number that should be large enough for all purposes.  Also define
> a type that has at least MAX_RECOG_ALTERNATIVES + 1 bits, with the extra
> bit giving an invalid value that can be used to mean "uninitialized".  */

This part breaks for me (up-to-date amd64-linux host, cf. for example
http://toolchain.lug-owl.de/laminar/jobs/gcc-local/82):

configure   '--with-pkgversion=basepoints/gcc-14-3093-g4a8e6fa8016, built 
at 1691996332'\
--prefix=/var/lib/laminar/run/gcc-local/82/toolchain-install
\
--enable-werror-always  
\
--enable-languages=all  
\
--disable-multilib
make V=1 all-gcc

echo timestamp > s-preds-h
TARGET_CPU_DEFAULT="" \
HEADERS="config/i386/i386-d.h" DEFINES="" \
/bin/bash ../../gcc/gcc/mkconfig.sh tm_d.h
/var/lib/laminar/run/gcc-local/82/local-toolchain-install/bin/g++ -std=c++11 -c 
  -g -O2   -DIN_GCC-fno-exceptions -fno-rtti -fasynchronous-unwind-tables 
-W -Wall -Wno-narrowing -Wwrite-strings -Wcast-qual -Wmissing-format-attribute 
-Wconditionally-supported -Woverloaded-virtual -pedantic -Wno-long-long 
-Wno-variadic-macros -Wno-overlength-strings -Werror -fno-common  
-DHAVE_CONFIG_H  -DGENERATOR_FILE -I. -Ibuild -I../../gcc/gcc 
-I../../gcc/gcc/build -I../../gcc/gcc/../include  
-I../../gcc/gcc/../libcpp/include  \
 -o build/genflags.o ../../gcc/gcc/genflags.cc
/var/lib/laminar/run/gcc-local/82/local-toolchain-install/bin/g++ -std=c++11   
-g -O2   -DIN_GCC-fno-exceptions -fno-rtti -fasynchronous-unwind-tables -W 
-Wall -Wno-narrowing -Wwrite-strings -Wcast-qual -Wmissing-format-attribute 
-Wconditionally-supported -Woverloaded-virtual -pedantic -Wno-long-long 
-Wno-variadic-macros -Wno-overlength-strings -Werror -fno-common  
-DHAVE_CONFIG_H  -DGENERATOR_FILE -static-libstdc++ -static-libgcc  -o 
build/genflags \
build/genflags.o build/rtl.o build/read-rtl.o build/ggc-none.o build/vec.o 
build/min-insn-modes.o build/gensupport.o build/print-rtl.o build/hash-table.o 
build/sort.o build/read-md.o build/errors.o 
../build-x86_64-pc-linux-gnu/libiberty/libiberty.a
/var/lib/laminar/run/gcc-local/82/local-toolchain-install/bin/g++ -std=c++11 -c 
  -g -O2   -DIN_GCC-fno-exceptions -fno-rtti -fasynchronous-unwind-tables 
-W -Wall -Wno-narrowing -Wwrite-strings -Wcast-qual -Wmissing-format-attribute 
-Wconditionally-supported -Woverloaded-virtual -pedantic -Wno-long-long 
-Wno-variadic-macros -Wno-overlength-strings -Werror -fno-common  
-DHAVE_CONFIG_H  -DGENERATOR_FILE -I. -Ibuild -I../../gcc/gcc 
-I../../gcc/gcc/build -I../../gcc/gcc/../include  
-I../../gcc/gcc/../libcpp/include  \
 -o build/genconditions.o ../../gcc/gcc/genconditions.cc
/var/lib/laminar/run/gcc-local/82/local-toolchain-install/bin/g++ -std=c++11   
-g -O2   -DIN_GCC-fno-exceptions -fno-rtti -fasynchronous-unwind-tables -W 
-Wall -Wno-narrowing -Wwrite-strings -Wcast-qual -Wmissing-format-attribute 
-Wconditionally-supported -Woverloaded-virtual -pedantic -Wno-long-long 
-Wno-variadic-macros -Wno-overlength-strings -Werror -fno-common  
-DHAVE_CONFIG_H  -DGENERATOR_FILE -static-libstdc++ -static-libgcc  -o 
build/genconditions \
build/genconditions.o build/rtl.o build/read-rtl.o build/ggc-none.o 
build/vec.o build/min-insn-modes.o build/gensupport.o build/print-rtl.o 
build/hash-table.o build/sort.o build/read-md.o build/errors.o 
../build-x86_64-pc-linux-gnu/libiberty/libiberty.a
build/genconditions ../../gcc/gcc/common.md ../../gcc/gcc/config/i386/i386.md > 
tmp-condmd.cc
/bin/bash ../../gcc/gcc/../move-if-change tmp-condmd.cc build/gencondmd.cc
echo timestamp > s-conditions
build/genpreds -c ../../gcc/gcc/common.md ../../gcc/gcc/config/i386/i386.md > 
tmp-constrs.h
/bin/bash ../../gcc/gcc/../move-if-change tmp-constrs.h tm-constrs.h
echo timestamp > s-constrs-h
/var/lib/laminar/run/gcc-local/82/local-toolchain-install/bin/g++ -std=c++11 -c 
  -g -O2   -DIN_GCC-fno-exceptions -fno-rtti -fasynchronous-unwind-tables 
-W -Wall -Wno-narrowing -Wwrite-strings -Wcast-qual -Wmissing-format-attribute 
-Wconditionally-supported -Woverloaded-virtual -pedantic -Wno-long-long 
-Wno-variadic-macros -Wno-overlength-strings -Werror -fno-common  
-DHAVE_CONFIG_H  -DGENERATOR_FILE -I. -Ibuild -I../../gcc/gcc 
-I../../gcc/gcc/build -I../../gcc/gcc/../include  
-I../../gcc/gcc/../libcpp/include  \
 -o build/g

RE: [PATCH v1] RISC-V: Support RVV VFSQRT rounding mode intrinsic API

Committed, thanks Juzhe.

Pan

From: juzhe.zh...@rivai.ai 
Sent: Monday, August 14, 2023 3:44 PM
To: Li, Pan2 ; gcc-patches 
Cc: Li, Pan2 ; Wang, Yanzhang ; 
kito.cheng 
Subject: Re: [PATCH v1] RISC-V: Support RVV VFSQRT rounding mode intrinsic API

LGTM


juzhe.zh...@rivai.ai

From: pan2.li
Date: 2023-08-14 15:39
To: gcc-patches
CC: juzhe.zhong; 
pan2.li; 
yanzhang.wang; 
kito.cheng
Subject: [PATCH v1] RISC-V: Support RVV VFSQRT rounding mode intrinsic API
From: Pan Li mailto:pan2...@intel.com>>

This patch would like to support the rounding mode API for the
VFSQRT as the below samples.

* __riscv_vfsqrt_v_f32m1_rm
* __riscv_vfsqrt_v_f32m1_rm_m

Signed-off-by: Pan Li mailto:pan2...@intel.com>>

gcc/ChangeLog:

* config/riscv/riscv-vector-builtins-bases.cc
(class unop_frm): New class for frm.
(vfsqrt_frm_obj): New declaration.
(BASE): Ditto.
* config/riscv/riscv-vector-builtins-bases.h: Ditto.
* config/riscv/riscv-vector-builtins-functions.def
(vfsqrt_frm): New intrinsic function definition.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/base/float-point-sqrt.c: New test.
---
.../riscv/riscv-vector-builtins-bases.cc  | 17 ++
.../riscv/riscv-vector-builtins-bases.h   |  1 +
.../riscv/riscv-vector-builtins-functions.def |  2 ++
.../riscv/rvv/base/float-point-sqrt.c | 31 +++
4 files changed, 51 insertions(+)
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/base/float-point-sqrt.c

diff --git a/gcc/config/riscv/riscv-vector-builtins-bases.cc 
b/gcc/config/riscv/riscv-vector-builtins-bases.cc
index b458560a040..2074dac0f16 100644
--- a/gcc/config/riscv/riscv-vector-builtins-bases.cc
+++ b/gcc/config/riscv/riscv-vector-builtins-bases.cc
@@ -631,6 +631,21 @@ public:
   }
};
+/* Implements below instructions for frm
+   - vfsqrt
+*/
+template<rtx_code CODE>
+class unop_frm : public function_base
+{
+public:
+  bool has_rounding_mode_operand_p () const override { return true; }
+
+  rtx expand (function_expander &e) const override
+  {
+return e.use_exact_insn (code_for_pred (CODE, e.vector_mode ()));
+  }
+};
+
/* Implements vrsub.  */
class vrsub : public function_base
{
@@ -2415,6 +2430,7 @@ static CONSTEXPR const vfwmsac_frm vfwmsac_frm_obj;
static CONSTEXPR const vfwnmsac vfwnmsac_obj;
static CONSTEXPR const vfwnmsac_frm vfwnmsac_frm_obj;
static CONSTEXPR const unop vfsqrt_obj;
+static CONSTEXPR const unop_frm<SQRT> vfsqrt_frm_obj;
static CONSTEXPR const float_misc vfrsqrt7_obj;
static CONSTEXPR const float_misc vfrec7_obj;
static CONSTEXPR const binop vfmin_obj;
@@ -2662,6 +2678,7 @@ BASE (vfwmsac_frm)
BASE (vfwnmsac)
BASE (vfwnmsac_frm)
BASE (vfsqrt)
+BASE (vfsqrt_frm)
BASE (vfrsqrt7)
BASE (vfrec7)
BASE (vfmin)
diff --git a/gcc/config/riscv/riscv-vector-builtins-bases.h 
b/gcc/config/riscv/riscv-vector-builtins-bases.h
index 85e8b9a3769..5c91381bd4c 100644
--- a/gcc/config/riscv/riscv-vector-builtins-bases.h
+++ b/gcc/config/riscv/riscv-vector-builtins-bases.h
@@ -184,6 +184,7 @@ extern const function_base *const vfwmsac_frm;
extern const function_base *const vfwnmsac;
extern const function_base *const vfwnmsac_frm;
extern const function_base *const vfsqrt;
+extern const function_base *const vfsqrt_frm;
extern const function_base *const vfrsqrt7;
extern const function_base *const vfrec7;
extern const function_base *const vfmin;
diff --git a/gcc/config/riscv/riscv-vector-builtins-functions.def 
b/gcc/config/riscv/riscv-vector-builtins-functions.def
index 7e2a4ab2969..a821aca6a4b 100644
--- a/gcc/config/riscv/riscv-vector-builtins-functions.def
+++ b/gcc/config/riscv/riscv-vector-builtins-functions.def
@@ -388,6 +388,8 @@ DEF_RVV_FUNCTION (vfwnmsac_frm, alu_frm, full_preds, 
f_wwfv_ops)
// 13.8. Vector Floating-Point Square-Root Instruction
DEF_RVV_FUNCTION (vfsqrt, alu, full_preds, f_v_ops)
+DEF_RVV_FUNCTION (vfsqrt_frm, alu_frm, full_preds, f_v_ops)
+
// 13.9. Vector Floating-Point Reciprocal Square-Root Estimate Instruction
DEF_RVV_FUNCTION (vfrsqrt7, alu, full_preds, f_v_ops)
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/base/float-point-sqrt.c 
b/gcc/testsuite/gcc.target/riscv/rvv/base/float-point-sqrt.c
new file mode 100644
index 000..afd1fb2b8f6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/base/float-point-sqrt.c
@@ -0,0 +1,31 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv -mabi=lp64 -O3 -Wno-psabi" } */
+
+#include "riscv_vector.h"
+
+typedef float float32_t;
+
+vfloat32m1_t
+test_riscv_vfsqrt_vv_f32m1_rm (vfloat32m1_t op1, size_t vl) {
+  return __riscv_vfsqrt_v_f32m1_rm (op1, 0, vl);
+}
+
+vfloat32m1_t
+test_vfsqrt_vv_f32m1_rm_m (vbool32_t mask, vfloat32m1_t op1, size_t vl) {
+  return __riscv_vfsqrt_v_f32m1_rm_m (mask, op1, 1, vl);
+}
+
+vfloat32m1_t
+test_riscv_vfsqrt_vv_f32m1 (vfloat32m1_t op1, size_t v

[PATCH] Fix print_loop_info ICE

It ICEs when invoked via debug_loops while dump_file is not set.

Pushed as obvious.

* tree-cfg.cc (print_loop_info): Dump to 'file', not 'dump_file'.
---
 gcc/tree-cfg.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gcc/tree-cfg.cc b/gcc/tree-cfg.cc
index ab1f8067c54..fae80bb5b91 100644
--- a/gcc/tree-cfg.cc
+++ b/gcc/tree-cfg.cc
@@ -8565,7 +8565,7 @@ print_loop_info (FILE *file, const class loop *loop, 
const char *prefix)
   fprintf (file, "\n%siterations by profile: %f (%s%s) entry count:", 
prefix,
   iterations.to_double (), reliable ? "reliable" : "unreliable",
   maybe_flat_loop_profile (loop) ? ", maybe flat" : "");
-  loop_count_in (loop).dump (dump_file, cfun);
+  loop_count_in (loop).dump (file, cfun);
 }
 
 }
-- 
2.35.3

Re: [PATCH v1 2/6] LoongArch: improved target configuration interface

2023-08-14 Thread Yujie Yang

On Mon, Aug 14, 2023 at 01:58:24PM +0800, Xi Ruoyao wrote:
> On Mon, 2023-08-14 at 11:57 +0800, Yang Yujie wrote:
> > * Support options for LoongArch SIMD extensions:
> >   new configure options --with-simd={none,lsx,lasx};
> >   new driver options -m[no]-l[a]sx / -msimd={none,lsx,lasx}.
> 
> I suggest to rename --with-simd= to --with-ext= and accept a comma-
> separated ISA extension list, because we have non-SIMD ISA extensions. 
> For example, "--with-ext=lasx,lbt" will make -mlasx, -mlsx (implied),
> and -mlbt the default.  I prefer "-mlasx" over "-msimd=lasx" because "-
> mlasx" is shorter anyway (if there is no real reason to make -mlasx and
> -msimd=lasx two different things).
> 
> -- 
> Xi Ruoyao 
> School of Aerospace Science and Technology, Xidian University

Thanks for the suggestion, --with-ext seems a good idea.  I will
consider adding it later.

-msimd=lasx and -mlasx are the same if you only use them once, but things
get complicated if you also want -mno-lsx to cancel the effect of -mlasx.

In short, -msimd= is *necessary* for a correct (and convenient) implementation
of the "legacy" -m[no-]l[a]sx options.  Let's hope I can explain this clearly:

I assume we all want:

 (1) -mlasx -mlsx -> enable LSX and LASX
 (2) -mlasx -mno-lsx -> disable LSX and LASX
 (3) -mno-lsx -mlasx -> enable LSX and LASX

Unless we declare -mlsx / -mlasx as driver deferred, AFAIK there is no other
way for us to know the actual order of appearance of all -m[no-]l[a]sx options
on the command line.  All we know from GCC's option system would be a final
on/off state of "lsx" and a final on/off state of "lasx".

These states result from independent processing of the -m[no-]lsx and
-m[no-]lasx options in the order they appear, respectively.  So we can't
distinguish between (2) and (3).

Another seemingly viable approach is to declare three options, -mlsx / -mlasx /
-mno-lsx, and make them mutually exclusive.  In this way GCC would only
consider the last one appearing on the command line.  But we lose (1) in this
case.

So, it seems that GCC forbids a compiler option from expressing a "state
change" rather than an independent configuration state itself.  This makes
sense since "-grecord-gcc-switches" and the LTO objects prefer that a target
configuration can be uniquely expressed as a "tuple of coordinates", rather
than a series of "connected vectors", so that they can be easily compared.

Now what we want is clear:

1. support convenient options like -mlsx / -mlasx (maybe something like
-m32 / -m64 in the future) that express state changes that interfere with
the effect of prior flags with different names.

2. the options passed to the compiler proper have a clear one-to-one
correspondence to the target configuration.

The implementation: canonicalize everything in the GCC driver, and process the
"flags" that express state changes, in the order they appear, into a group of
independent parameters.  If the parameters have overlapping semantics, the
priority is hard-coded, like -march and -msimd.

(This is also necessary for multilib since this is where the multilib selection
happens and final ABI needs to be decided before that.)

For this purpose, -msimd= is the canonicalized parameter form of the state
of SIMD extension, and -m[no-]l[a]sx are defined as convenient driver-only
flags.

Re: [PATCH 2/3] ivopts: Call valid_mem_ref_p with code_helper [PR110248]

Hi,

on 2023/8/14 15:53, Jan-Benedict Glaw wrote:
> On Fri, 2023-06-30 13:46:40 +0800, Kewen.Lin via Gcc-patches 
>  wrote:
>> Bootstrapped and regtested on x86_64-redhat-linux and
>> powerpc64{,le}-linux-gnu.
>>
>> Is it ok for trunk?
> [...]
> 
>> diff --git a/gcc/recog.h b/gcc/recog.h
>> index badf8e3dc1c..c6ef619c5dd 100644
>> --- a/gcc/recog.h
>> +++ b/gcc/recog.h
>> @@ -20,6 +20,9 @@ along with GCC; see the file COPYING3.  If not see
>>  #ifndef GCC_RECOG_H
>>  #define GCC_RECOG_H
>>
>> +/* For enum tree_code ERROR_MARK.  */
>> +#include "tree.h"
>> +
>>  /* Random number that should be large enough for all purposes.  Also define
>> a type that has at least MAX_RECOG_ALTERNATIVES + 1 bits, with the extra
>> bit giving an invalid value that can be used to mean "uninitialized".  */
> 
> This part breaks for me (up-to-date amd64-linux host, cf. for example
> http://toolchain.lug-owl.de/laminar/jobs/gcc-local/82):
> 
> configure '--with-pkgversion=basepoints/gcc-14-3093-g4a8e6fa8016, built 
> at 1691996332'\
>   --prefix=/var/lib/laminar/run/gcc-local/82/toolchain-install
> \
>   --enable-werror-always  
> \
>   --enable-languages=all  
> \
>   --disable-multilib
> make V=1 all-gcc
> 
> echo timestamp > s-preds-h
> TARGET_CPU_DEFAULT="" \
> HEADERS="config/i386/i386-d.h" DEFINES="" \
> /bin/bash ../../gcc/gcc/mkconfig.sh tm_d.h
> /var/lib/laminar/run/gcc-local/82/local-toolchain-install/bin/g++ -std=c++11 
> -c   -g -O2   -DIN_GCC-fno-exceptions -fno-rtti 
> -fasynchronous-unwind-tables -W -Wall -Wno-narrowing -Wwrite-strings 
> -Wcast-qual -Wmissing-format-attribute -Wconditionally-supported 
> -Woverloaded-virtual -pedantic -Wno-long-long -Wno-variadic-macros 
> -Wno-overlength-strings -Werror -fno-common  -DHAVE_CONFIG_H  
> -DGENERATOR_FILE -I. -Ibuild -I../../gcc/gcc -I../../gcc/gcc/build 
> -I../../gcc/gcc/../include  -I../../gcc/gcc/../libcpp/include  \
>  -o build/genflags.o ../../gcc/gcc/genflags.cc
> /var/lib/laminar/run/gcc-local/82/local-toolchain-install/bin/g++ -std=c++11  
>  -g -O2   -DIN_GCC-fno-exceptions -fno-rtti -fasynchronous-unwind-tables 
> -W -Wall -Wno-narrowing -Wwrite-strings -Wcast-qual 
> -Wmissing-format-attribute -Wconditionally-supported -Woverloaded-virtual 
> -pedantic -Wno-long-long -Wno-variadic-macros -Wno-overlength-strings -Werror 
> -fno-common  -DHAVE_CONFIG_H  -DGENERATOR_FILE -static-libstdc++ 
> -static-libgcc  -o build/genflags \
> build/genflags.o build/rtl.o build/read-rtl.o build/ggc-none.o 
> build/vec.o build/min-insn-modes.o build/gensupport.o build/print-rtl.o 
> build/hash-table.o build/sort.o build/read-md.o build/errors.o 
> ../build-x86_64-pc-linux-gnu/libiberty/libiberty.a
> /var/lib/laminar/run/gcc-local/82/local-toolchain-install/bin/g++ -std=c++11 
> -c   -g -O2   -DIN_GCC-fno-exceptions -fno-rtti 
> -fasynchronous-unwind-tables -W -Wall -Wno-narrowing -Wwrite-strings 
> -Wcast-qual -Wmissing-format-attribute -Wconditionally-supported 
> -Woverloaded-virtual -pedantic -Wno-long-long -Wno-variadic-macros 
> -Wno-overlength-strings -Werror -fno-common  -DHAVE_CONFIG_H  
> -DGENERATOR_FILE -I. -Ibuild -I../../gcc/gcc -I../../gcc/gcc/build 
> -I../../gcc/gcc/../include  -I../../gcc/gcc/../libcpp/include  \
>  -o build/genconditions.o ../../gcc/gcc/genconditions.cc
> /var/lib/laminar/run/gcc-local/82/local-toolchain-install/bin/g++ -std=c++11  
>  -g -O2   -DIN_GCC-fno-exceptions -fno-rtti -fasynchronous-unwind-tables 
> -W -Wall -Wno-narrowing -Wwrite-strings -Wcast-qual 
> -Wmissing-format-attribute -Wconditionally-supported -Woverloaded-virtual 
> -pedantic -Wno-long-long -Wno-variadic-macros -Wno-overlength-strings -Werror 
> -fno-common  -DHAVE_CONFIG_H  -DGENERATOR_FILE -static-libstdc++ 
> -static-libgcc  -o build/genconditions \
> build/genconditions.o build/rtl.o build/read-rtl.o build/ggc-none.o 
> build/vec.o build/min-insn-modes.o build/gensupport.o build/print-rtl.o 
> build/hash-table.o build/sort.o build/read-md.o build/errors.o 
> ../build-x86_64-pc-linux-gnu/libiberty/libiberty.a
> build/genconditions ../../gcc/gcc/common.md ../../gcc/gcc/config/i386/i386.md 
> > tmp-condmd.cc
> /bin/bash ../../gcc/gcc/../move-if-change tmp-condmd.cc build/gencondmd.cc
> echo timestamp > s-conditions
> build/genpreds -c ../../gcc/gcc/common.md ../../gcc/gcc/config/i386/i386.md > 
> tmp-constrs.h
> /bin/bash ../../gcc/gcc/../move-if-change tmp-constrs.h tm-constrs.h
> echo timestamp > s-constrs-h
> /var/lib/laminar/run/gcc-local/82/local-toolchain-install/bin/g++ -std=c++11 
> -c   -g -O2   -DIN_GCC-fno-exceptions -fno-rtti 
> -fasynchronous-unwind-tables -W -Wall -Wno-narrowing -Wwrite-strings 
> -Wcast-qual -Wmissing-format-attribute -Wconditionally-supported 
> -Woverloaded-virtual -pedantic -Wno-long-long -Wno-variadic-macros

Re: [PATCH v1 2/6] LoongArch: improved target configuration interface

On Mon, 2023-08-14 at 16:44 +0800, Yujie Yang wrote:
> I assume we all want:
> 
>  (1) -mlasx -mlsx -> enable LSX and LASX
>  (2) -mlasx -mno-lsx -> disable LSX and LASX
>  (3) -mno-lsx -mlasx -> enable LSX and LASX

Yes.

> Unless we declare -mlsx / -mlasx as driver deferred, AFAIK there is no other
> way for us to know the actual order of appearance of all -m[no-]l[a]sx
> options on the command line.  All we know from GCC's option system would be
> a final on/off state of "lsx" and a final on/off state of "lasx".

But x86 does this correctly:

$ echo __AVX__ + __AVX2__ | LANG= cpp -E -mno-avx -mavx2
# 0 ""
# 0 ""
# 0 ""
# 1 "/usr/include/stdc-predef.h" 1 3 4
# 0 "" 2
# 1 ""
1 + 1

so there must be a way to handle this...

-- 
Xi Ruoyao 
School of Aerospace Science and Technology, Xidian University

Re: [PATCH V4] VECT: Support loop len control on EXTRACT_LAST vectorization

Hi Juzhe,

on 2023/8/14 15:09, juzhe.zh...@rivai.ai wrote:
> Thanks Richi.
> 
> CC kewen to see whether this patch is suitable for powerpc and s390.

I did a bootstrapping and regression testing on Power10 (LE) and found a lot of 
failures.

A short list looks like:

< FAIL: gcc.c-torture/compile/20150108.c   -O3 -fomit-frame-pointer 
-funroll-loops -fpeel-loops -ftracer -finline-functions  (internal compiler 
error: in expand_vec_extract_optab_fn,
at internal-fn.cc:3164)
< FAIL: gcc.c-torture/compile/20150108.c   -O3 -fomit-frame-pointer 
-funroll-loops -fpeel-loops -ftracer -finline-functions  (test for excess 
errors)
< FAIL: gcc.c-torture/compile/20150108.c   -O3 -g  (internal compiler error: in 
expand_vec_extract_optab_fn, at internal-fn.cc:3164)
< FAIL: gcc.c-torture/compile/20150108.c   -O3 -g  (test for excess errors)
< FAIL: gcc.c-torture/execute/20011126-2.c   -O3 -fomit-frame-pointer 
-funroll-loops -fpeel-loops -ftracer -finline-functions  (internal compiler 
error: in expand_vec_extract_optab_fn,
at internal-fn.cc:3164)
< FAIL: gcc.c-torture/execute/20011126-2.c   -O3 -fomit-frame-pointer 
-funroll-loops -fpeel-loops -ftracer -finline-functions  (test for excess 
errors)
< FAIL: gcc.c-torture/execute/20011126-2.c   -O3 -g  (internal compiler error: 
in expand_vec_extract_optab_fn, at internal-fn.cc:3164)
< FAIL: gcc.c-torture/execute/20011126-2.c   -O3 -g  (test for excess errors)
< FAIL: gcc.c-torture/execute/pr58419.c   -O3 -fomit-frame-pointer 
-funroll-loops -fpeel-loops -ftracer -finline-functions  (internal compiler 
error: in expand_vec_extract_optab_fn, at
internal-fn.cc:3164)
< FAIL: gcc.c-torture/execute/pr58419.c   -O3 -fomit-frame-pointer 
-funroll-loops -fpeel-loops -ftracer -finline-functions  (test for excess 
errors)
< FAIL: gcc.c-torture/execute/pr58419.c   -O3 -g  (internal compiler error: in 
expand_vec_extract_optab_fn, at internal-fn.cc:3164)
< FAIL: gcc.c-torture/execute/pr58419.c   -O3 -g  (test for excess errors)
< FAIL: gcc.dg/pr84321.c (internal compiler error: in 
expand_vec_extract_optab_fn, at internal-fn.cc:3164)
< FAIL: gcc.dg/pr84321.c (test for excess errors)
< FAIL: gcc.dg/torture/pr108793.c   -O3 -fomit-frame-pointer -funroll-loops 
-fpeel-loops -ftracer -finline-functions  (internal compiler error: in 
expand_vec_extract_optab_fn, at internal-fn.cc:3164)
< FAIL: gcc.dg/torture/pr108793.c   -O3 -fomit-frame-pointer -funroll-loops 
-fpeel-loops -ftracer -finline-functions  (test for excess errors)
< FAIL: gcc.dg/torture/pr108793.c   -O3 -g  (internal compiler error: in 
expand_vec_extract_optab_fn, at internal-fn.cc:3164)
< FAIL: gcc.dg/torture/pr108793.c   -O3 -g  (test for excess errors)
< FAIL: gcc.dg/torture/pr51070-2.c   -O3 -fomit-frame-pointer -funroll-loops 
-fpeel-loops -ftracer -finline-functions  (internal compiler error: in 
expand_vec_extract_optab_fn, at
internal-fn.cc:3164)
< FAIL: gcc.dg/torture/pr51070-2.c   -O3 -fomit-frame-pointer -funroll-loops 
-fpeel-loops -ftracer -finline-functions  (test for excess errors)
< FAIL: gcc.dg/torture/pr51070-2.c   -O3 -g  (internal compiler error: in 
expand_vec_extract_optab_fn, at internal-fn.cc:3164)
< FAIL: gcc.dg/torture/pr51070-2.c   -O3 -g  (test for excess errors)
< FAIL: gcc.dg/torture/pr51070.c   -O3 -fomit-frame-pointer -funroll-loops 
-fpeel-loops -ftracer -finline-functions  (internal compiler error: in 
expand_vec_extract_optab_fn, at internal-fn.cc:3164)
< FAIL: gcc.dg/torture/pr51070.c   -O3 -fomit-frame-pointer -funroll-loops 
-fpeel-loops -ftracer -finline-functions  (test for excess errors)
< FAIL: gcc.dg/torture/pr51070.c   -O3 -g  (internal compiler error: in 
expand_vec_extract_optab_fn, at internal-fn.cc:3164)


> 
> --
> juzhe.zh...@rivai.ai
> 
>  
> *From:* Richard Biener 
> *Date:* 2023-08-14 14:53
> *To:* Ju-Zhe Zhong 
> *CC:* gcc-patches ; richard.sandiford 
> 
>

[PATCH] vect: Remove several useless VMAT_INVARIANT checks

Hi,

In function vectorizable_load, there is one hunk which is
dedicated to handling VMAT_INVARIANT and returns early,
which means we shouldn't encounter any cases with
memory_access_type VMAT_INVARIANT in the code that
follows it.  This patch cleans up several useless
checks on VMAT_INVARIANT.  There should be no functional
changes.

Bootstrapped and regtested on x86_64-redhat-linux,
aarch64-linux-gnu and powerpc64{,le}-linux-gnu.

gcc/ChangeLog:

* tree-vect-stmts.cc (vectorizable_load): Remove some useless checks
on VMAT_INVARIANT.
---
 gcc/tree-vect-stmts.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 89607a98f99..d4e781531fd 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -10499,7 +10499,7 @@ vectorizable_load (vec_info *vinfo,
  tree bias = NULL_TREE;
  if (!costing_p)
{
- if (loop_masks && memory_access_type != VMAT_INVARIANT)
+ if (loop_masks)
final_mask
  = vect_get_loop_mask (loop_vinfo, gsi, loop_masks,
vec_num * ncopies, vectype,
@@ -10729,7 +10729,7 @@ vectorizable_load (vec_info *vinfo,
bias = build_int_cst (intQI_type_node, biasval);
  }

-   if (final_len && memory_access_type != VMAT_INVARIANT)
+   if (final_len)
  {
tree ptr
  = build_int_cst (ref_type, align * BITS_PER_UNIT);
--
2.31.1

[PATCH] vect: Move VMAT_LOAD_STORE_LANES handlings from final loop nest

Hi,

Following Richi's suggestion [1], this patch is to move the
handlings on VMAT_LOAD_STORE_LANES in the final loop nest
of function vectorizable_load to its own loop.  Basically
it duplicates the final loop nest, clean up some useless
set up code for the case of VMAT_LOAD_STORE_LANES, remove
some unreachable code.  Also remove the corresponding
handlings in the final loop nest.

Bootstrapped and regtested on x86_64-redhat-linux,
aarch64-linux-gnu and powerpc64{,le}-linux-gnu.

[1] https://gcc.gnu.org/pipermail/gcc-patches/2023-June/623329.html

gcc/ChangeLog:

* tree-vect-stmts.cc (vectorizable_load): Move the handlings on
VMAT_LOAD_STORE_LANES in the final loop nest to its own loop,
and update the final nest accordingly.
---
 gcc/tree-vect-stmts.cc | 1275 
 1 file changed, 634 insertions(+), 641 deletions(-)

diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 4f2d088484c..c361e16cb7b 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -10332,7 +10332,129 @@ vectorizable_load (vec_info *vinfo,
vect_get_vec_defs_for_operand (vinfo, stmt_info, ncopies, mask,
   &vec_masks, mask_vectype);
 }
+
   tree vec_mask = NULL_TREE;
+  if (memory_access_type == VMAT_LOAD_STORE_LANES)
+{
+  gcc_assert (alignment_support_scheme == dr_aligned
+ || alignment_support_scheme == dr_unaligned_supported);
+  gcc_assert (grouped_load && !slp);
+
+  unsigned int inside_cost = 0, prologue_cost = 0;
+  for (j = 0; j < ncopies; j++)
+   {
+ if (costing_p)
+   {
+ /* An IFN_LOAD_LANES will load all its vector results,
+regardless of which ones we actually need.  Account
+for the cost of unused results.  */
+ if (first_stmt_info == stmt_info)
+   {
+ unsigned int gaps = DR_GROUP_SIZE (first_stmt_info);
+ stmt_vec_info next_stmt_info = first_stmt_info;
+ do
+   {
+ gaps -= 1;
+ next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
+   }
+ while (next_stmt_info);
+ if (gaps)
+   {
+ if (dump_enabled_p ())
+   dump_printf_loc (MSG_NOTE, vect_location,
+"vect_model_load_cost: %d "
+"unused vectors.\n",
+gaps);
+ vect_get_load_cost (vinfo, stmt_info, gaps,
+ alignment_support_scheme,
+ misalignment, false, &inside_cost,
+ &prologue_cost, cost_vec, cost_vec,
+ true);
+   }
+   }
+ vect_get_load_cost (vinfo, stmt_info, 1, alignment_support_scheme,
+ misalignment, false, &inside_cost,
+ &prologue_cost, cost_vec, cost_vec, true);
+ continue;
+   }
+
+ /* 1. Create the vector or array pointer update chain.  */
+ if (j == 0)
+   dataref_ptr
+ = vect_create_data_ref_ptr (vinfo, first_stmt_info, aggr_type,
+ at_loop, offset, &dummy, gsi,
+ &ptr_incr, false, bump);
+ else
+   {
+ gcc_assert (!LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo));
+ dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr, gsi,
+stmt_info, bump);
+   }
+ if (mask)
+   vec_mask = vec_masks[j];
+
+ tree vec_array = create_vector_array (vectype, vec_num);
+
+ tree final_mask = NULL_TREE;
+ if (loop_masks)
+   final_mask = vect_get_loop_mask (loop_vinfo, gsi, loop_masks,
+ncopies, vectype, j);
+ if (vec_mask)
+   final_mask = prepare_vec_mask (loop_vinfo, mask_vectype, final_mask,
+  vec_mask, gsi);
+
+ gcall *call;
+ if (final_mask)
+   {
+ /* Emit:
+  VEC_ARRAY = MASK_LOAD_LANES (DATAREF_PTR, ALIAS_PTR,
+   VEC_MASK).  */
+ unsigned int align = TYPE_ALIGN (TREE_TYPE (vectype));
+ tree alias_ptr = build_int_cst (ref_type, align);
+ call = gimple_build_call_internal (IFN_MASK_LOAD_LANES, 3,
+dataref_ptr, alias_ptr,
+final_mask);
+   }
+ else
+   {
+ /* Emit:
+  VEC_

Re: [PATCH v1 2/6] LoongArch: improved target configuration interface

On Mon, 2023-08-14 at 16:57 +0800, Yujie Yang wrote:
> On Mon, Aug 14, 2023 at 04:49:11PM +0800, Xi Ruoyao wrote:
> > On Mon, 2023-08-14 at 16:44 +0800, Yujie Yang wrote:
> > > I assume we all want:
> > > 
> > >  (1) -mlasx -mlsx -> enable LSX and LASX
> > >  (2) -mlasx -mno-lsx -> disable LSX and LASX
> > >  (3) -mno-lsx -mlasx -> enable LSX and LASX
> > 
> > Yes.
> > 
> > > Unless we declare -mlsx / -mlasx as driver deferred, AFAIK there is no 
> > > other way for
> > > us to know the actual order of appearance of all -m[no-]l[a]sx options on 
> > > the command
> > > line.  All we know from GCC's option system would be a final on/off state 
> > > of "lsx"
> > > and a final on/off state of "lasx".
> > 
> > But x86 does this correct;
> > 
> > $ echo __AVX__ + __AVX2__ | LANG= cpp -E -mno-avx -mavx2
> > # 0 ""
> > # 0 ""
> > # 0 ""
> > # 1 "/usr/include/stdc-predef.h" 1 3 4
> > # 0 "" 2
> > # 1 ""
> > 1 + 1
> > 
> > so there must be a way to handle this...
> > 
> > -- 
> > Xi Ruoyao 
> > School of Aerospace Science and Technology, Xidian University
> 
> Emm... What happens if you reverse the order?
> 
> $ echo __AVX__ + __AVX2__ | LANG= cpp -E -mavx2 -mno-avx
> 
> Anyways, I believe there may be other ways to implement this, but it would
> require as much effort as (or even much more than) the current approach.
> Especially considering the possibility of future updates -- we now have a
> framework for this sort of thing.
> 
> Meanwhile you can comfortably stay away from -msimd= and use only
> -mlsx / -mlasx. So...a matter of style maybe?

I'm OK with that, but we need to document it clearly in invoke.texi.

-- 
Xi Ruoyao 
School of Aerospace Science and Technology, Xidian University

Re: [PATCH V4] VECT: Support loop len control on EXTRACT_LAST vectorization

2023-08-14 Thread Robin Dapp via Gcc-patches

Hi Kewen,

> I did a bootstrapping and regression testing on Power10 (LE) and found a lot 
> of failures.

I think the problem is that just like for vec_set we're expecting
the vec_extract expander not to fail.  It is probably passed not a
const int here anymore and therefore fails to expand?

can_vec_extract_var_idx_p is supposed to check if the backend
supports extracting a variable index.

Regards
 Robin

[PATCH] vect: Move VMAT_GATHER_SCATTER handlings from final loop nest

Hi,

Following Richi's suggestion [1], this patch is to move the
handlings on VMAT_GATHER_SCATTER in the final loop nest
of function vectorizable_load to its own loop.  Basically
it duplicates the final loop nest, clean up some useless
set up code for the case of VMAT_GATHER_SCATTER, remove some
unreachable code.  Also remove the corresponding handlings
in the final loop nest.

Bootstrapped and regtested on x86_64-redhat-linux,
aarch64-linux-gnu and powerpc64{,le}-linux-gnu.

[1] https://gcc.gnu.org/pipermail/gcc-patches/2023-June/623329.html

Is it ok for trunk?

BR,
Kewen
-

gcc/ChangeLog:

* tree-vect-stmts.cc (vectorizable_load): Move the handlings on
VMAT_GATHER_SCATTER in the final loop nest to its own loop,
and update the final nest accordingly.
---
 gcc/tree-vect-stmts.cc | 361 +
 1 file changed, 219 insertions(+), 142 deletions(-)

diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index c361e16cb7b..5e514eca19b 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -10455,6 +10455,218 @@ vectorizable_load (vec_info *vinfo,
   return true;
 }

+  if (memory_access_type == VMAT_GATHER_SCATTER)
+{
+  gcc_assert (alignment_support_scheme == dr_aligned
+ || alignment_support_scheme == dr_unaligned_supported);
+  gcc_assert (!grouped_load && !slp_perm);
+
+  unsigned int inside_cost = 0, prologue_cost = 0;
+  for (j = 0; j < ncopies; j++)
+   {
+ /* 1. Create the vector or array pointer update chain.  */
+ if (j == 0 && !costing_p)
+   {
+ if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
+   vect_get_gather_scatter_ops (loop_vinfo, loop, stmt_info,
+slp_node, &gs_info, &dataref_ptr,
+&vec_offsets);
+ else
+   dataref_ptr
+ = vect_create_data_ref_ptr (vinfo, first_stmt_info, aggr_type,
+ at_loop, offset, &dummy, gsi,
+ &ptr_incr, false, bump);
+   }
+ else if (!costing_p)
+   {
+ gcc_assert (!LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo));
+ if (!STMT_VINFO_GATHER_SCATTER_P (stmt_info))
+   dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr,
+  gsi, stmt_info, bump);
+   }
+
+ if (mask && !costing_p)
+   vec_mask = vec_masks[j];
+
+ gimple *new_stmt = NULL;
+ for (i = 0; i < vec_num; i++)
+   {
+ tree final_mask = NULL_TREE;
+ tree final_len = NULL_TREE;
+ tree bias = NULL_TREE;
+ if (!costing_p)
+   {
+ if (loop_masks)
+   final_mask
+ = vect_get_loop_mask (loop_vinfo, gsi, loop_masks,
+   vec_num * ncopies, vectype,
+   vec_num * j + i);
+ if (vec_mask)
+   final_mask = prepare_vec_mask (loop_vinfo, mask_vectype,
+  final_mask, vec_mask, gsi);
+
+ if (i > 0 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
+   dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr,
+  gsi, stmt_info, bump);
+   }
+
+ /* 2. Create the vector-load in the loop.  */
+ unsigned HOST_WIDE_INT align;
+ if (gs_info.ifn != IFN_LAST)
+   {
+ if (costing_p)
+   {
+ unsigned int cnunits = vect_nunits_for_cost (vectype);
+ inside_cost
+   = record_stmt_cost (cost_vec, cnunits, scalar_load,
+   stmt_info, 0, vect_body);
+ continue;
+   }
+ if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
+   vec_offset = vec_offsets[vec_num * j + i];
+ tree zero = build_zero_cst (vectype);
+ tree scale = size_int (gs_info.scale);
+
+ if (gs_info.ifn == IFN_MASK_LEN_GATHER_LOAD)
+   {
+ if (loop_lens)
+   final_len
+ = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
+  vec_num * ncopies, vectype,
+  vec_num * j + i, 1);
+ else
+   final_len
+ = build_int_cst (sizetype,
+  TYPE_VECTOR_SUBPARTS (vectype));
+ signed char biasval
+   = LOOP_VINFO_PARTIAL_LOAD_

Re: [PATCH V4] VECT: Support loop len control on EXTRACT_LAST vectorization

On Mon, 14 Aug 2023, Robin Dapp wrote:

> Hi Kewen,
> 
> > I did a bootstrapping and regression testing on Power10 (LE) and found a 
> > lot of failures.
> 
> I think the problem is that just like for vec_set we're expecting
> the vec_extract expander not to fail.  It is probably passed not a
> const int here anymore and therefore fails to expand?
> 
> can_vec_extract_var_idx_p is supposed to check if the backend
> supports extracting a variable index.

expansion does

  enum insn_code icode = convert_optab_handler (optab, outermode,
extract_mode);

  if (icode != CODE_FOR_nothing)
{
  create_output_operand (&ops[0], target, extract_mode);
  create_input_operand (&ops[1], src, outermode);
  create_convert_operand_from (&ops[2], pos,
   TYPE_MODE (TREE_TYPE (op1)), true);
  if (maybe_expand_insn (icode, 3, ops))
{
  if (!rtx_equal_p (target, ops[0].value))
emit_move_insn (target, ops[0].value);
  return;
}

<--- here

}
  gcc_unreachable ();

so if maybe_expand_insn fails that would need to be sth we need
to cover in the predicate to check.  But that looks possibly
target dependent?  What does actually fail here?

Richard.

Re: [PATCH V4] VECT: Support loop len control on EXTRACT_LAST vectorization

Hi Robin,

on 2023/8/14 16:58, Robin Dapp wrote:
> Hi Kewen,
> 
>> I did a bootstrapping and regression testing on Power10 (LE) and found a lot 
>> of failures.
> 
> I think the problem is that just like for vec_set we're expecting
> the vec_extract expander not to fail.  It is probably passed not a
> const int here anymore and therefore fails to expand?

Thanks for the comments!  Yeah, I think the expectation doesn't hold
on Power, as our vec_extract optab only supports a const index, that
is:

(define_expand "vec_extract"
  [(match_operand: 0 "register_operand")
   (match_operand:VEC_E 1 "vlogical_operand")
   (match_operand 2 "const_int_operand")]
  "VECTOR_MEM_ALTIVEC_OR_VSX_P (mode)"
{
  rs6000_expand_vector_extract (operands[0], operands[1], operands[2]);
  DONE;
})

> 
> can_vec_extract_var_idx_p is supposed to check if the backend
> supports extracting a variable index.

OK, it sounds like this new capability needs a further check with
function can_vec_extract_var_idx_p to ensure the ifn expansion works
as expected.  I re-spun the patch by adding the below, per your comments:

diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index 07f3717ed9d..80ba5cae84a 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -10328,7 +10328,9 @@ vectorizable_live_operation (vec_info *vinfo, 
stmt_vec_info stmt_info,
   else if (convert_optab_handler (vec_extract_optab,
   TYPE_MODE (vectype),
   TYPE_MODE (TREE_TYPE (vectype)))
-   != CODE_FOR_nothing)
+ != CODE_FOR_nothing
+   && can_vec_extract_var_idx_p (
+ TYPE_MODE (vectype), TYPE_MODE (TREE_TYPE (vectype
 vect_record_loop_len (loop_vinfo,
   &LOOP_VINFO_LENS (loop_vinfo),
   1, vectype, 1);

BR,
Kewen

Re: [PATCH v1 2/6] LoongArch: improved target configuration interface

2023-08-14 Thread Yujie Yang

On Mon, Aug 14, 2023 at 04:49:11PM +0800, Xi Ruoyao wrote:
> On Mon, 2023-08-14 at 16:44 +0800, Yujie Yang wrote:
> > I assume we all want:
> > 
> >  (1) -mlasx -mlsx -> enable LSX and LASX
> >  (2) -mlasx -mno-lsx -> disable LSX and LASX
> >  (3) -mno-lsx -mlasx -> enable LSX and LASX
> 
> Yes.
> 
> > Unless we declare -mlsx / -mlasx as driver deferred, AFAIK there is no 
> > other way for
> > us to know the actual order of appearance of all -m[no-]l[a]sx options on 
> > the command
> > line.  All we know from GCC's option system would be a final on/off state 
> > of "lsx"
> > and a final on/off state of "lasx".
> 
> But x86 does this correct;
> 
> $ echo __AVX__ + __AVX2__ | LANG= cpp -E -mno-avx -mavx2
> # 0 ""
> # 0 ""
> # 0 ""
> # 1 "/usr/include/stdc-predef.h" 1 3 4
> # 0 "" 2
> # 1 ""
> 1 + 1
> 
> so there must be a way to handle this...
> 
> -- 
> Xi Ruoyao 
> School of Aerospace Science and Technology, Xidian University

Emm... What happens if you reverse the order?

$ echo __AVX__ + __AVX2__ | LANG= cpp -E -mavx2 -mno-avx

Anyways, I believe there may be other ways to implement this, but it would
require as much effort as (or even much more than) the current approach.
Especially considering the possibility of future updates -- we now have a
framework for this sort of thing.

Meanwhile you can comfortably stay away from -msimd= and use only
-mlsx / -mlasx. So...a matter of style maybe?

Re: Re: [PATCH V4] VECT: Support loop len control on EXTRACT_LAST vectorization

-   != CODE_FOR_nothing)
+ != CODE_FOR_nothing
+   && can_vec_extract_var_idx_p (
+ TYPE_MODE (vectype), TYPE_MODE (TREE_TYPE (vectype

I think the 'can_vec_extract_var_idx_p' check alone may be enough, so the
convert_optab_handler (vec_extract_optab,... check could be removed.
Looking forward to more comments from Richi.

Thanks.


juzhe.zh...@rivai.ai
 
From: Kewen.Lin
Date: 2023-08-14 17:19
To: Robin Dapp
CC: gcc-patches; richard.sandiford; rguenther; juzhe.zh...@rivai.ai
Subject: Re: [PATCH V4] VECT: Support loop len control on EXTRACT_LAST 
vectorization
Hi Robin,
 
on 2023/8/14 16:58, Robin Dapp wrote:
> Hi Kewen,
> 
>> I did a bootstrapping and regression testing on Power10 (LE) and found a lot 
>> of failures.
> 
> I think the problem is that just like for vec_set we're expecting
> the vec_extract expander not to fail.  It is probably passed not a
> const int here anymore and therefore fails to expand?
 
Thanks for the comments!  Yeah, I think the expectation doesn't hold
on Power, as our vec_extract optab only support const index, that
is:
 
(define_expand "vec_extract"
  [(match_operand: 0 "register_operand")
   (match_operand:VEC_E 1 "vlogical_operand")
   (match_operand 2 "const_int_operand")]
  "VECTOR_MEM_ALTIVEC_OR_VSX_P (mode)"
{
  rs6000_expand_vector_extract (operands[0], operands[1], operands[2]);
  DONE;
})
 
> 
> can_vec_extract_var_idx_p is supposed to check if the backend
> supports extracting a variable index.
 
OK, it sounds that this new capability needs to further check with
function can_vec_extract_var_idx_p to ensure the ifn expanding work
as expected.  I re-spined by adding the below as your comments:
 
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index 07f3717ed9d..80ba5cae84a 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -10328,7 +10328,9 @@ vectorizable_live_operation (vec_info *vinfo, 
stmt_vec_info stmt_info,
   else if (convert_optab_handler (vec_extract_optab,
   TYPE_MODE (vectype),
   TYPE_MODE (TREE_TYPE (vectype)))
-   != CODE_FOR_nothing)
+ != CODE_FOR_nothing
+   && can_vec_extract_var_idx_p (
+ TYPE_MODE (vectype), TYPE_MODE (TREE_TYPE (vectype
 vect_record_loop_len (loop_vinfo,
   &LOOP_VINFO_LENS (loop_vinfo),
   1, vectype, 1);
 
BR,
Kewen

Re: [gcc r13-7720] x86: Update model values for Raptorlake.

2023-08-14 Thread Florian Weimer via Gcc-patches

* Lili Cui via Gcc-cvs:

> https://gcc.gnu.org/g:0fa76e35a5f9e141c08fdf151380f2f9689101c7
>
> commit r13-7720-g0fa76e35a5f9e141c08fdf151380f2f9689101c7
> Author: Cui, Lili 
> Date:   Mon Aug 14 02:06:00 2023 +
>
> x86: Update model values for Raptorlake.
> 
> Update model values for Raptorlake according to SDM.
> 
> gcc/ChangeLog
> 
> * common/config/i386/cpuinfo.h (get_intel_cpu): Add model value 
> 0xba
> to Raptorlake.
>
> Diff:
> ---
>  gcc/common/config/i386/cpuinfo.h | 2 ++
>  1 file changed, 2 insertions(+)
>
> diff --git a/gcc/common/config/i386/cpuinfo.h 
> b/gcc/common/config/i386/cpuinfo.h
> index 81f8b1766f8d..d6f2b7e3cfb3 100644
> --- a/gcc/common/config/i386/cpuinfo.h
> +++ b/gcc/common/config/i386/cpuinfo.h
> @@ -539,6 +539,8 @@ get_intel_cpu (struct __processor_model *cpu_model,
>  case 0xbf:
>/* Alder Lake.  */
>  case 0xb7:
> +case 0xba:
> +case 0xbf:
>/* Raptor Lake.  */
>  case 0xaa:
>  case 0xac:

This mismerge breaks the build (duplicate 0xbf case values).

Thanks,
Florian

Bootstrap fail on GCC 13 (was: Re: [PATCH] x86: Update model values for Alderlake, Rocketlake and Raptorlake.)

2023-08-14 Thread Tobias Burnus

Hi,

your GCC 13 commit
https://gcc.gnu.org/r13-7720-g0fa76e35a5f9e1 x86: Update model values for 
Raptorlake.

causes a build fail:

gcc/common/config/i386/cpuinfo.h: In function ‘const char* 
get_intel_cpu(__processor_model*, __processor_model2*, unsigned int*)’:
gcc/common/config/i386/cpuinfo.h:543:5: error: duplicate case value
  543 | case 0xbf:
  | ^~~~
gcc/common/config/i386/cpuinfo.h:539:5: note: previously used here
  539 | case 0xbf:
  | ^~~~

Your patch did:

 case 0x97:
 case 0x9a:
 case 0xbf:   << Existing case value
   /* Alder Lake.  */
 case 0xb7:
+case 0xba:
+case 0xbf:  << Newly added same case value
   /* Raptor Lake.  */


Tobias

On 29.06.23 05:06, Cui, Lili via Gcc-patches wrote:

I will directly commit this patch, it can be considered as an obvious patch.

Thanks,
Lili.


-Original Message-
From: Gcc-patches  On
Behalf Of Cui, Lili via Gcc-patches
Sent: Wednesday, June 28, 2023 6:52 PM
To: gcc-patches@gcc.gnu.org
Cc: Liu, Hongtao 
Subject: [PATCH] x86: Update model values for Alderlake, Rocketlake and
Raptorlake.

Hi Hongtao,

This patch is to update model values for Alderlake, Rocketlake and
Raptorlake according to SDM.

Ok for trunk?

Thanks.
Lili.

Update model values for Alderlake, Rocketlake and Raptorlake according to
SDM.

gcc/ChangeLog

 * common/config/i386/cpuinfo.h (get_intel_cpu): Remove model
value 0xa8
 from Rocketlake, move model value 0xbf from Alderlake to
Raptorlake.
---
  gcc/common/config/i386/cpuinfo.h | 3 +--
  1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/gcc/common/config/i386/cpuinfo.h
b/gcc/common/config/i386/cpuinfo.h
index 61559ed9de2..ae48bc17771 100644
--- a/gcc/common/config/i386/cpuinfo.h
+++ b/gcc/common/config/i386/cpuinfo.h
@@ -463,7 +463,6 @@ get_intel_cpu (struct __processor_model
*cpu_model,
cpu_model->__cpu_subtype = INTEL_COREI7_SKYLAKE;
break;
  case 0xa7:
-case 0xa8:
/* Rocket Lake.  */
cpu = "rocketlake";
CHECK___builtin_cpu_is ("corei7"); @@ -536,9 +535,9 @@ get_intel_cpu
(struct __processor_model *cpu_model,
break;
  case 0x97:
  case 0x9a:
-case 0xbf:
/* Alder Lake.  */
  case 0xb7:
+case 0xbf:
/* Raptor Lake.  */
  case 0xaa:
  case 0xac:
--
2.25.1

-
Siemens Electronic Design Automation GmbH; Anschrift: Arnulfstraße 201, 80634 
München; Gesellschaft mit beschränkter Haftung; Geschäftsführer: Thomas 
Heurung, Frank Thürauf; Sitz der Gesellschaft: München; Registergericht 
München, HRB 106955

[PATCH] genrecog: Add SUBREG_BYTE.to_constant check to the genrecog

2023-08-14 Thread Juzhe-Zhong

Hi, there is a genrecog issue that happens in the RISC-V backend.

This is the ICE info:

0xfa3ba4 poly_int_pod<2u, unsigned short>::to_constant() const
../../../riscv-gcc/gcc/poly-int.h:504
0x28eaa91 recog_5
../../../riscv-gcc/gcc/config/riscv/bitmanip.md:314
0x28ec5b4 recog_7
../../../riscv-gcc/gcc/config/riscv/iterators.md:81
0x2a2e740 recog_436
../../../riscv-gcc/gcc/config/riscv/thead.md:265
0x2a729ef recog_475
../../../riscv-gcc/gcc/config/riscv/sync.md:509
0x2a75aec recog(rtx_def*, rtx_insn*, int*)
../../../riscv-gcc/gcc/config/riscv/iterators.md:55
0x2b3e39e recog_for_combine_1
../../../riscv-gcc/gcc/combine.cc:11382
0x2b3f457 recog_for_combine
../../../riscv-gcc/gcc/combine.cc:11652
0x2b25a15 try_combine
../../../riscv-gcc/gcc/combine.cc:4054
0x2b1d3f1 combine_instructions
../../../riscv-gcc/gcc/combine.cc:1266
0x2b48cfc rest_of_handle_combine
../../../riscv-gcc/gcc/combine.cc:15063
0x2b48db8 execute
../../../riscv-gcc/gcc/combine.cc:15107

This is because the genrecog code here causes an ICE for scalable vectors in RISC-V:

Before this patch:

static int
recog_5 (rtx x1 ATTRIBUTE_UNUSED,
rtx_insn *insn ATTRIBUTE_UNUSED,
int *pnum_clobbers ATTRIBUTE_UNUSED)
{
  rtx * const operands ATTRIBUTE_UNUSED = &recog_data.operand[0];
  rtx x2, x3, x4;
  int res ATTRIBUTE_UNUSED;
  if (pnum_clobbers == NULL)
return -1;
  x2 = XEXP (x1, 1);
  x3 = XEXP (x2, 0);
  if (maybe_ne (SUBREG_BYTE (x3).to_constant (), 0) ---> this code cause ICE.
  || GET_MODE (x3) != E_SImode
  || !register_operand (operands[0], E_DImode)
  || GET_MODE (x2) != E_DImode)
return -1;
...

This ICE happens since we have following RTL IR:

(insn 27 26 29 4 (set (reg:RVVM1HI 155 [ vect__12.23 ])
(sign_extend:RVVM1HI (subreg:RVVMF2QI (reg:RVVMF2x2QI 146 [ 
vect_array.19 ]) [8, 8]))) "auto.c":29:1 discrim 2 12570 
{extendrvvmf2qirvvm1hi2}
 (expr_list:REG_DEAD (reg:RVVMF2x2QI 146 [ vect_array.19 ])
(nil)))

This is the scalable vector with SUBREG_BYTE = poly (8, 8)

After this patch:

static int
recog_5 (rtx x1 ATTRIBUTE_UNUSED,
rtx_insn *insn ATTRIBUTE_UNUSED,
int *pnum_clobbers ATTRIBUTE_UNUSED)
{
  rtx * const operands ATTRIBUTE_UNUSED = &recog_data.operand[0];
  rtx x2, x3, x4;
  int res ATTRIBUTE_UNUSED;
  if (pnum_clobbers == NULL)
return -1;
  x2 = XEXP (x1, 1);
  x3 = XEXP (x2, 0);
  if ((SUBREG_BYTE (x3).is_constant () && maybe_ne (SUBREG_BYTE 
(x3).to_constant (), 0))   > change here and fix ICE.
  || GET_MODE (x3) != E_SImode
  || !register_operand (operands[0], E_DImode)
  || GET_MODE (x2) != E_DImode)
return -1;

Is this reasonable?

Thanks.

gcc/ChangeLog:

* genrecog.cc (print_test): Add SUBREG_BYTE.to_constant () check.

---
 gcc/genrecog.cc | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/gcc/genrecog.cc b/gcc/genrecog.cc
index 04a5533ca4b..28884ab3985 100644
--- a/gcc/genrecog.cc
+++ b/gcc/genrecog.cc
@@ -4705,11 +4705,14 @@ print_test (output_state *os, const rtx_test &test, 
bool is_param,
   break;
 
 case rtx_test::SUBREG_FIELD:
+  printf ("(SUBREG_BYTE (");
+  print_test_rtx (os, test);
+  printf (").is_constant () && ");
   printf ("%s (", invert_p ? "maybe_ne" : "known_eq");
   print_nonbool_test (os, test);
   printf (", ");
   print_label_value (test, is_param, value);
-  printf (")");
+  printf ("))");
   break;
 
 case rtx_test::SAVED_CONST_INT:
-- 
2.36.3

[PATCH] Fix for bug libstdc++/110860

2023-08-14 Thread Paul Dreik via Gcc-patches

The patch below fixes an issue with the fix already committed for 
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=110860 which unfortunately 
was not sufficient for small magnitude floating point values.


With the patch in place, the code now survives the fuzzing I used to 
find the problem in the first place. Tested on amd64.


I prepared the patch using git show, which should include the signoff as 
instructed per the DCO.


Thanks, Paul


commit 848b8d948787495e64ed9c55d681eccf730b74fb
Author: Paul Dreik 
Date:   Mon Aug 14 11:52:30 2023 +0200

libstdc++: Avoid problematic use of log10 in std::format [PR110860]

If abs(__v) is smaller than one, the result will be on the
form 0.x. It is only if the magnitude is large that more digits
are needed before the decimal dot.

This uses frexp instead of log10 which should be less expensive
and have sufficient precision for the desired purpose.

It removes the problematic cases where log10 will be negative or not
fit in an int.

Signed-off-by: Paul Dreik 

diff --git a/libstdc++-v3/include/std/format 
b/libstdc++-v3/include/std/format

index f4520ff3f..729e3d4b9 100644
--- a/libstdc++-v3/include/std/format
+++ b/libstdc++-v3/include/std/format
@@ -1490,14 +1490,22 @@ namespace __format
  // If the buffer is too small it's probably because of a large
  // precision, or a very large value in fixed format.
  size_t __guess = 8 + __prec;
- if (__fmt == chars_format::fixed && __v != 0) // +ddd.prec
+ if (__fmt == chars_format::fixed) // +ddd.prec
{
- if constexpr (is_same_v<_Fp, float>)
-   __guess += __builtin_log10f(__v < 0.0f ? -__v : __v);
- else if constexpr (is_same_v<_Fp, double>)
-   __guess += __builtin_log10(__v < 0.0 ? -__v : __v);
- else if constexpr (is_same_v<_Fp, long double>)
-   __guess += __builtin_log10l(__v < 0.0l ? -__v : __v);
+		  if constexpr (is_same_v<_Fp, float> || is_same_v<_Fp, double> || 
is_same_v<_Fp, long double>)

+   {
+ // the number of digits to the left of the decimal point
+ // is floor(log10(max(abs(__v),1)))+1
+ int __exp{};
+ if constexpr (is_same_v<_Fp, float>)
+   __builtin_frexpf(__v, &__exp);
+ else if constexpr (is_same_v<_Fp, double>)
+   __builtin_frexp(__v, &__exp);
+ else if constexpr (is_same_v<_Fp, long double>)
+   __builtin_frexpl(__v, &__exp);
+ if (__exp>0)
+   __guess += 1U + __exp * 4004U / 13301U; // log10(2) 
approx.
+   }
  else
__guess += numeric_limits<_Fp>::max_exponent10;
}


OpenPGP_signature
Description: OpenPGP digital signature

Re: [PATCH] genrecog: Add SUBREG_BYTE.to_constant check to the genrecog

2023-08-14 Thread Richard Sandiford via Gcc-patches

Juzhe-Zhong  writes:
> Hi, there is genrecog issue happens in RISC-V backend.
>
> This is the ICE info:
>
> 0xfa3ba4 poly_int_pod<2u, unsigned short>::to_constant() const
> ../../../riscv-gcc/gcc/poly-int.h:504
> 0x28eaa91 recog_5
> ../../../riscv-gcc/gcc/config/riscv/bitmanip.md:314
> 0x28ec5b4 recog_7
> ../../../riscv-gcc/gcc/config/riscv/iterators.md:81
> 0x2a2e740 recog_436
> ../../../riscv-gcc/gcc/config/riscv/thead.md:265
> 0x2a729ef recog_475
> ../../../riscv-gcc/gcc/config/riscv/sync.md:509
> 0x2a75aec recog(rtx_def*, rtx_insn*, int*)
> ../../../riscv-gcc/gcc/config/riscv/iterators.md:55
> 0x2b3e39e recog_for_combine_1
> ../../../riscv-gcc/gcc/combine.cc:11382
> 0x2b3f457 recog_for_combine
> ../../../riscv-gcc/gcc/combine.cc:11652
> 0x2b25a15 try_combine
> ../../../riscv-gcc/gcc/combine.cc:4054
> 0x2b1d3f1 combine_instructions
> ../../../riscv-gcc/gcc/combine.cc:1266
> 0x2b48cfc rest_of_handle_combine
> ../../../riscv-gcc/gcc/combine.cc:15063
> 0x2b48db8 execute
> ../../../riscv-gcc/gcc/combine.cc:15107
>
> This is because the genrecog code here cause ICE for scalable vector in 
> RISC-V:
>
> Before this patch:
>
> static int
> recog_5 (rtx x1 ATTRIBUTE_UNUSED,
> rtx_insn *insn ATTRIBUTE_UNUSED,
> int *pnum_clobbers ATTRIBUTE_UNUSED)
> {
>   rtx * const operands ATTRIBUTE_UNUSED = &recog_data.operand[0];
>   rtx x2, x3, x4;
>   int res ATTRIBUTE_UNUSED;
>   if (pnum_clobbers == NULL)
> return -1;
>   x2 = XEXP (x1, 1);
>   x3 = XEXP (x2, 0);
>   if (maybe_ne (SUBREG_BYTE (x3).to_constant (), 0) ---> this code cause ICE.
>   || GET_MODE (x3) != E_SImode
>   || !register_operand (operands[0], E_DImode)
>   || GET_MODE (x2) != E_DImode)
> return -1;
> ...
>
> This ICE happens since we have following RTL IR:
>
> (insn 27 26 29 4 (set (reg:RVVM1HI 155 [ vect__12.23 ])
> (sign_extend:RVVM1HI (subreg:RVVMF2QI (reg:RVVMF2x2QI 146 [ 
> vect_array.19 ]) [8, 8]))) "auto.c":29:1 discrim 2 12570 
> {extendrvvmf2qirvvm1hi2}
>  (expr_list:REG_DEAD (reg:RVVMF2x2QI 146 [ vect_array.19 ])
> (nil)))
>
> This is the scalable vector with SUBREG_BYTE = poly (8, 8)
>
> After this patch:
>
> static int
> recog_5 (rtx x1 ATTRIBUTE_UNUSED,
> rtx_insn *insn ATTRIBUTE_UNUSED,
> int *pnum_clobbers ATTRIBUTE_UNUSED)
> {
>   rtx * const operands ATTRIBUTE_UNUSED = &recog_data.operand[0];
>   rtx x2, x3, x4;
>   int res ATTRIBUTE_UNUSED;
>   if (pnum_clobbers == NULL)
> return -1;
>   x2 = XEXP (x1, 1);
>   x3 = XEXP (x2, 0);
>   if ((SUBREG_BYTE (x3).is_constant () && maybe_ne (SUBREG_BYTE 
> (x3).to_constant (), 0))   > change here and fix ICE.
>   || GET_MODE (x3) != E_SImode
>   || !register_operand (operands[0], E_DImode)
>   || GET_MODE (x2) != E_DImode)
> return -1;
>
> Does it reasonable ?
>
> Thanks.
>
> gcc/ChangeLog:
>
> * genrecog.cc (print_test): Add SUBREG_BYTE.to_constant () check.

I think instead we should revert the addition of to_constant.
See: https://inbox.sourceware.org/gcc-patches/mptedn4bwf7@arm.com/
(and earlier messages in that thread).

Thanks,
Richard

>
> ---
>  gcc/genrecog.cc | 5 -
>  1 file changed, 4 insertions(+), 1 deletion(-)
>
> diff --git a/gcc/genrecog.cc b/gcc/genrecog.cc
> index 04a5533ca4b..28884ab3985 100644
> --- a/gcc/genrecog.cc
> +++ b/gcc/genrecog.cc
> @@ -4705,11 +4705,14 @@ print_test (output_state *os, const rtx_test &test, 
> bool is_param,
>break;
>  
>  case rtx_test::SUBREG_FIELD:
> +  printf ("(SUBREG_BYTE (");
> +  print_test_rtx (os, test);
> +  printf (").is_constant () && ");
>printf ("%s (", invert_p ? "maybe_ne" : "known_eq");
>print_nonbool_test (os, test);
>printf (", ");
>print_label_value (test, is_param, value);
> -  printf (")");
> +  printf ("))");
>break;
>  
>  case rtx_test::SAVED_CONST_INT:

Re: [PATCH] rs6000, add overloaded DFP quantize support

Hi Carl,

on 2023/8/9 23:52, Carl Love wrote:
> 
> GCC maintainers:
> 
> The following patch adds four built-ins for the decimal floating point
> (DFP) quantize instructions on rs6000.  The built-ins are for 64-bit
> and 128-bit DFP operands.
> 
> The patch also adds a test case for the new builtins.
> 
> The Patch has been tested on Power 10LE and Power 9 LE/BE.
> 
> Please let me know if the patch is acceptable for mainline.  Thanks.
> 
>  Carl Love
> 
> 
> --
> rs6000, add overloaded DFP quantize support
> 
> Add decimal floating point (DFP) quantize built-ins for both 64-bit DFP
> and 128-DFP operands.  In each case, there is an immediate version and a
> variable version of the bult-in.  The RM value is a 2-bit const int which

Nit: s/bult-in/built-in/

> specifies the rounding mode to use.  For the immediate versions of the
> built-in, TE field is a 5-bit constant that specifies the value of the
> ideal exponent for the result.  The built-in specifications are:
> 
>   __Decimal64 builtin_dfp_quantize (_Decimal64, _Decimal64,
>   const int RM)
>   __Decimal64 builtin_dfp_quantize (const int TE, _Decimal64,
>   const int)
>   __Decimal128 builtin_dfpq_quantize (_Decimal128, _Decimal128,
> const int RM)
>   __Decimal128 builtin_dfpq_quantize (const int TE, _Decimal128,
> const int)
> 

I noticed that the existing DFP bifs directly use the insn
mnemonics, and I prefer to keep consistent with them.  So could we
have one function as the unique external interface, like dfp_quantize,
for users to use?
And we can have the underlying instances for it:

__Decimal64 builtin_dfp_dqua (_Decimal64, _Decimal64, const int RM)

__Decimal64 builtin_dfp_dquai (const int TE, _Decimal64, const int)

__Decimal128 builtin_dfp_dquaq (_Decimal128, _Decimal128, const int RM)

__Decimal128 builtin_dfp_dquaiq (const int TE, _Decimal128, const int)

Besides, this patch does not update the documentation; please add the
new built-ins to gcc/doc/extend.texi by searching for "The following built-in functions
are available when hardware decimal floating point".

> A testcase is added for the new built-in definitions.
> 
> gcc/ChangeLog:
>   * config/rs6000/dfp.md: New UNSPECDQUAN.
>   (dfp_quan_, dfp_quan_i): New define_insn.
>   * config/rs6000/rs6000-builtins.def (__builtin_dfp_quantize_64,
>   __builtin_dfp_quantize_64i, __builtin_dfp_quantize_128,
>   __builtin_dfp_quantize_128i): New built-in definitions.
>   * config/rs6000/rs6000-overload.def (__builtin_dfp_quantize,
>   __builtin_dfpq_quantize): New overloaded definitions.
> 
> gcc/testsuite/
>* gcc.target/powerpc/builtin-dfp-quantize-runnable.c: New test
>   case.
> ---
>  gcc/config/rs6000/dfp.md  |  25 ++-
>  gcc/config/rs6000/rs6000-builtins.def |  15 ++
>  gcc/config/rs6000/rs6000-overload.def |  12 ++
>  .../powerpc/builtin-dfp-quantize-runnable.c   | 198 ++
>  4 files changed, 249 insertions(+), 1 deletion(-)
>  create mode 100644 
> gcc/testsuite/gcc.target/powerpc/builtin-dfp-quantize-runnable.c
> 
> diff --git a/gcc/config/rs6000/dfp.md b/gcc/config/rs6000/dfp.md
> index 5ed8a73ac51..254c22a5c20 100644
> --- a/gcc/config/rs6000/dfp.md
> +++ b/gcc/config/rs6000/dfp.md
> @@ -271,7 +271,8 @@
> UNSPEC_DIEX
> UNSPEC_DSCLI
> UNSPEC_DTSTSFI
> -   UNSPEC_DSCRI])
> +   UNSPEC_DSCRI
> +   UNSPEC_DQUAN])
>  
>  (define_code_iterator DFP_TEST [eq lt gt unordered])
>  
> @@ -395,3 +396,25 @@
>"dscri %0,%1,%2"
>[(set_attr "type" "dfp")
> (set_attr "size" "")])
> +
> +(define_insn "dfp_quan_"

s/quan/dqua/ to align with the others.

> +  [(set (match_operand:DDTD 0 "gpc_reg_operand" "=d")
> +(unspec:DDTD [(match_operand:DDTD 1 "gpc_reg_operand" "d")
> +   (match_operand:DDTD 2 "gpc_reg_operand" "d")
> +  (match_operand:QI 3 "immediate_operand" "i")]
> + UNSPEC_DQUAN))]
> +  "TARGET_DFP"
> +  "dqua %0,%1,%2,%3"
> +  [(set_attr "type" "dfp")
> +   (set_attr "size" "")])
> +
> +(define_insn "dfp_quan_i"

also s/quan_i/dquai_/

> +  [(set (match_operand:DDTD 0 "gpc_reg_operand" "=d")
> +(unspec:DDTD [(match_operand:SI 1 "const_int_operand" "n")
> +   (match_operand:DDTD 2 "gpc_reg_operand" "d")
> +  (match_operand:SI 3 "immediate_operand" "i")]
> + UNSPEC_DQUAN))]
> +  "TARGET_DFP"
> +  "dquai %1,%0,%2,%3"
> +  [(set_attr "type" "dfp")
> +   (set_attr "size" "")])
> diff --git a/gcc/config/rs6000/rs6000-builtins.def 
> b/gcc/config/rs6000/rs6000-builtins.def
> index 35c4cdf74c5..36a56311643 100644
> --- a/gcc/config/rs6000/rs6000-builtins.def
> +++ b/gcc/config/rs6000/rs6000-builtins.def
> @@ -2983,6 +2983,21 @@
>const unsigned long long __builtin_unpack_dec128 (_Decimal1

Re: Re: [PATCH] genrecog: Add SUBREG_BYTE.to_constant check to the genrecog

Thanks Richard.

It can fix my issue and reverted to the trunk.

Thanks.


juzhe.zh...@rivai.ai
 
From: Richard Sandiford
Date: 2023-08-14 18:00
To: Juzhe-Zhong
CC: gcc-patches; rguenther; jeffreyalaw
Subject: Re: [PATCH] genrecog: Add SUBREG_BYTE.to_constant check to the genrecog
Juzhe-Zhong  writes:
> Hi, there is a genrecog issue that happens in the RISC-V backend.
>
> This is the ICE info:
>
> 0xfa3ba4 poly_int_pod<2u, unsigned short>::to_constant() const
> ../../../riscv-gcc/gcc/poly-int.h:504
> 0x28eaa91 recog_5
> ../../../riscv-gcc/gcc/config/riscv/bitmanip.md:314
> 0x28ec5b4 recog_7
> ../../../riscv-gcc/gcc/config/riscv/iterators.md:81
> 0x2a2e740 recog_436
> ../../../riscv-gcc/gcc/config/riscv/thead.md:265
> 0x2a729ef recog_475
> ../../../riscv-gcc/gcc/config/riscv/sync.md:509
> 0x2a75aec recog(rtx_def*, rtx_insn*, int*)
> ../../../riscv-gcc/gcc/config/riscv/iterators.md:55
> 0x2b3e39e recog_for_combine_1
> ../../../riscv-gcc/gcc/combine.cc:11382
> 0x2b3f457 recog_for_combine
> ../../../riscv-gcc/gcc/combine.cc:11652
> 0x2b25a15 try_combine
> ../../../riscv-gcc/gcc/combine.cc:4054
> 0x2b1d3f1 combine_instructions
> ../../../riscv-gcc/gcc/combine.cc:1266
> 0x2b48cfc rest_of_handle_combine
> ../../../riscv-gcc/gcc/combine.cc:15063
> 0x2b48db8 execute
> ../../../riscv-gcc/gcc/combine.cc:15107
>
> This is because the genrecog code here causes an ICE for scalable vectors in
> RISC-V:
>
> Before this patch:
>
> static int
> recog_5 (rtx x1 ATTRIBUTE_UNUSED,
> rtx_insn *insn ATTRIBUTE_UNUSED,
> int *pnum_clobbers ATTRIBUTE_UNUSED)
> {
>   rtx * const operands ATTRIBUTE_UNUSED = &recog_data.operand[0];
>   rtx x2, x3, x4;
>   int res ATTRIBUTE_UNUSED;
>   if (pnum_clobbers == NULL)
> return -1;
>   x2 = XEXP (x1, 1);
>   x3 = XEXP (x2, 0);
>   if (maybe_ne (SUBREG_BYTE (x3).to_constant (), 0) ---> this code cause ICE.
>   || GET_MODE (x3) != E_SImode
>   || !register_operand (operands[0], E_DImode)
>   || GET_MODE (x2) != E_DImode)
> return -1;
> ...
>
> This ICE happens since we have following RTL IR:
>
> (insn 27 26 29 4 (set (reg:RVVM1HI 155 [ vect__12.23 ])
> (sign_extend:RVVM1HI (subreg:RVVMF2QI (reg:RVVMF2x2QI 146 [ 
> vect_array.19 ]) [8, 8]))) "auto.c":29:1 discrim 2 12570 
> {extendrvvmf2qirvvm1hi2}
>  (expr_list:REG_DEAD (reg:RVVMF2x2QI 146 [ vect_array.19 ])
> (nil)))
>
> This is the scalable vector with SUBREG_BYTE = poly (8, 8)
>
> After this patch:
>
> static int
> recog_5 (rtx x1 ATTRIBUTE_UNUSED,
> rtx_insn *insn ATTRIBUTE_UNUSED,
> int *pnum_clobbers ATTRIBUTE_UNUSED)
> {
>   rtx * const operands ATTRIBUTE_UNUSED = &recog_data.operand[0];
>   rtx x2, x3, x4;
>   int res ATTRIBUTE_UNUSED;
>   if (pnum_clobbers == NULL)
> return -1;
>   x2 = XEXP (x1, 1);
>   x3 = XEXP (x2, 0);
>   if ((SUBREG_BYTE (x3).is_constant () && maybe_ne (SUBREG_BYTE 
> (x3).to_constant (), 0))   > change here and fix ICE.
>   || GET_MODE (x3) != E_SImode
>   || !register_operand (operands[0], E_DImode)
>   || GET_MODE (x2) != E_DImode)
> return -1;
>
> Is this reasonable?
>
> Thanks.
>
> gcc/ChangeLog:
>
> * genrecog.cc (print_test): Add SUBREG_BYTE.to_constant () check.
 
I think instead we should revert the addition of to_constant.
See: https://inbox.sourceware.org/gcc-patches/mptedn4bwf7@arm.com/
(and earlier messages in that thread).
 
Thanks,
Richard
 
>
> ---
>  gcc/genrecog.cc | 5 -
>  1 file changed, 4 insertions(+), 1 deletion(-)
>
> diff --git a/gcc/genrecog.cc b/gcc/genrecog.cc
> index 04a5533ca4b..28884ab3985 100644
> --- a/gcc/genrecog.cc
> +++ b/gcc/genrecog.cc
> @@ -4705,11 +4705,14 @@ print_test (output_state *os, const rtx_test &test, 
> bool is_param,
>break;
>  
>  case rtx_test::SUBREG_FIELD:
> +  printf ("(SUBREG_BYTE (");
> +  print_test_rtx (os, test);
> +  printf (").is_constant () && ");
>printf ("%s (", invert_p ? "maybe_ne" : "known_eq");
>print_nonbool_test (os, test);
>printf (", ");
>print_label_value (test, is_param, value);
> -  printf (")");
> +  printf ("))");
>break;
>  
>  case rtx_test::SAVED_CONST_INT:

Re: [PATCH v1 1/6] LoongArch: a symmetric multilib subdir layout

On Mon, 2023-08-14 at 18:18 +0800, Yujie Yang wrote:
> On Mon, Aug 14, 2023 at 03:48:53PM +0800, Xi Ruoyao wrote:
> > On Mon, 2023-08-14 at 15:37 +0800, Yujie Yang wrote:
> > > On Mon, Aug 14, 2023 at 01:38:40PM +0800, Xi Ruoyao wrote:
> > > > On Mon, 2023-08-14 at 11:57 +0800, Yang Yujie wrote:
> > > > 
> > > > > However, for LoongArch, we do not want such a "toplevel" library
> > > > > installation since the default ABI may change.  We expect all
> > > > > multilib variants of libraries to be installed to their designated
> > > > > ABI-specific subdirs (e.g. base/lp64d) of the GCC libdir, so that
> > > > > the default ABI can be configured arbitrarily (with --with-abi)
> > > > > while the gcc libdir layout stays consistent.  This could be
> > > > > helpful for the distribution packaging of GCC libraries.
> > > > 
> > > > Have you tested a --disable-multilib configuration?  To me, with
> > > > --disable-multilib everything should still be in the toplevel
> > > > directory, not any sub-directory.
> > > 
> > > That's a good point, sorry I missed --disable-multilib here.
> > > 
> > > However, you don't really need --disable-multilib since
> > > the libraries are only built once in the default ABI configuration
> > > as long as --with-multilib-list does not request anything more than
> > > that.
> > > 
> > > Maybe we should force-enabling multilib in all cases.
> > 
> > I really don't like this.  Why must I always remind my self "hey, this
> > is LoongArch, there is a different directory layout" when I don't need
> > multilib at all?
> > 
> 
> AFAIK, the two main uses of the multisubdir layout are in the C++
> header directory and the GCC libdir (where libgcc.a resides), respectively.
> The GCC libdir is fine since they are private to a user's GCC build.
> However, the C++ header directory is shared across the system unless
> an alternative sysroot is chosen, so the consistency of the multilib
> layout matters.

The C++ header directory should also be considered private to the GCC
build.  AFAIK no distro supports "overwriting a part of the system", so
you cannot just install a custom GCC build and overwrite the system C++
header directory.  For a cross compiler, the C++ header directory is
$prefix/$target_triple/include/c++/$gcc_version/$multi_dir, the C++
header in $sysroot/usr/include/c++ (if it ever exists) will not be used
at all.

> So theoretically, the toplevel libraries should have the same ABI under
> the target triplet.  However, for many architectures, the
> "--with-abi + MULTILIB_DEFAULT" scheme may cause the toplevel to be
> configured to have different meanings.

https://gcc.gnu.org/PR104085 is an example of the issue caused by the
different meaning.

> So I think it's also a reasonable approach that we just simply eliminate
> the ambiguous toplevel libraries and use a symmetric layout instead.

I don't like the inconsistency among different GCC ports.  If all ports
use the same approach I'll not object.

-- 
Xi Ruoyao 
School of Aerospace Science and Technology, Xidian University

[PATCH v2] In the pipeline, USE or CLOBBER should delay execution if it starts a new live range.

2023-08-14 Thread Jin Ma via Gcc-patches

CLOBBER and USE do not represent real instructions, but during
pipeline scheduling they wait in the ready list to be issued
like other insns, without regard to resource conflicts or cycles.
On a multi-issue CPU architecture they can therefore be issued at
any time when other regular insns have resource conflicts or cannot
be issued for other reasons. As a result, their position is advanced
in the generated insn sequence, which affects register allocation
and often leads to more redundant mov instructions.

A simple example:
https://github.com/majin2020/gcc-test/blob/master/test.c
This is a function in the dhrystone benchmark.

https://github.com/majin2020/gcc-test/blob/0b08c1a13de9663d7d9aba7539b960ec0607ca24/test.c.299r.sched1
This is a log of the pass 'sched1' with -mtune=rocket but issue_rate == 2.

The pipeline is:
;; | insn | prio |
;; |  17  |  3   | r142=a0 alu
;; |  14  |  0   | clobber r136 nothing
;; |  13  |  0   | clobber a0 nothing
;; |  18  |  2   | r143=a1 alu
...
;; |  12  |  0   | a0=r136 alu
;; |  15  |  0   | use a0 nothing

In this log, insn 13 and 14 are much ahead of schedule, which risks generating
redundant mov instructions, which seems unreasonable.

Therefore, I am submitting the patch again, revised according to the
last round of review comments, to try to solve this problem.

https://github.com/majin2020/gcc-test/commit/efcb43e3369e771bde702955048bfe3f501263dd#diff-805031b1be5092a2322852a248d0b0f92eef7cad5784a8209f4dfc6221407457L189
This is the diff log of sched1 after the patch is applied.

The new pipeline is:
;; | insn | prio |
;; |  17  |  3   | r142=a0 alu
...
;; |  10  |  0   | [r144]=r141 alu
;; |  13  |  0   | clobber a0 nothing
;; |  14  |  0   | clobber r136 nothing
;; |  12  |  0   | a0=r136 alu
;; |  15  |  0   | use a0 nothing

gcc/ChangeLog:
* haifa-sched.cc (use_or_clobber_starts_range_p): New.
(prune_ready_list): USE or CLOBBER should delay execution
if it starts a new live range.
---
 gcc/haifa-sched.cc | 55 +-
 1 file changed, 50 insertions(+), 5 deletions(-)

diff --git a/gcc/haifa-sched.cc b/gcc/haifa-sched.cc
index 8e8add709b3..47ad09457c7 100644
--- a/gcc/haifa-sched.cc
+++ b/gcc/haifa-sched.cc
@@ -765,6 +765,23 @@ real_insn_for_shadow (rtx_insn *insn)
   return pair->i1;
 }
 
+/* Return TRUE if INSN (a USE or CLOBBER) starts a new live
+range, FALSE otherwise.  */
+
+static bool
+use_or_clobber_starts_range_p (rtx_insn *insn)
+{
+  gcc_assert (insn);
+
+  if ((GET_CODE (PATTERN (insn)) == CLOBBER
+   || GET_CODE (PATTERN (insn)) == USE)
+  && !sd_lists_empty_p (insn, SD_LIST_FORW)
+  && sd_lists_empty_p (insn, SD_LIST_BACK))
+return true;
+
+  return false;
+}
+
 /* For a pair P of insns, return the fixed distance in cycles from the first
insn after which the second must be scheduled.  */
 static int
@@ -6320,11 +6337,39 @@ prune_ready_list (state_t temp_state, bool 
first_cycle_insn_p,
}
  else if (recog_memoized (insn) < 0)
{
- if (!first_cycle_insn_p
- && (GET_CODE (PATTERN (insn)) == ASM_INPUT
- || asm_noperands (PATTERN (insn)) >= 0))
-   cost = 1;
- reason = "asm";
+ if (GET_CODE (PATTERN (insn)) == ASM_INPUT
+ || asm_noperands (PATTERN (insn)) >= 0)
+   {
+ reason = "asm";
+ if (!first_cycle_insn_p)
+   cost = 1;
+   }
+ else if (use_or_clobber_starts_range_p (insn))
+   {
+ /* If USE or CLOBBER opens an active range, its execution 
should
+be delayed so as to be closer to the relevant instructions 
and
+avoid the generation of some redundant mov instructions.
+Otherwise, it should be executed as soon as possible.  */
+ reason = "unrecog insn";
+ if (!first_cycle_insn_p)
+   /* If USE or CLOBBER is not in the first cycle, simply 
delay it
+  by one cycle.  */
+   cost = 1;
+ else
+   {
+ /* If the USE or CLOBBER is in the first cycle and there 
are no
+other non-USE or non-CLOBBER instructions after it, we 
need
+to execute it immediately, otherwise we need to 
execute the
+non-USE or non-CLOBBER instructions first and postpone 
the
+execution of the USE or CLOBBER instructions.  */
+ int j = i;
+ while (n > ++j)
+   if (!use_or_clobber_starts_range_p (ready_element 
(&ready, j)))
+ break;
+
+ cost = (j == n) ? 0 : 1;
+   }
+   }
}
  else if (sched_pressure != SCHED_PRESSURE_NONE)

Re: [PATCH] vect: Remove several useless VMAT_INVARIANT checks

On Mon, Aug 14, 2023 at 10:52 AM Kewen.Lin  wrote:
>
> Hi,
>
> In function vectorizable_load, there is one hunk which is
> dedicated for the handlings on VMAT_INVARIANT and return
> early, it means we shouldn't encounter any cases with
> memory_access_type VMAT_INVARIANT in the following code
> after that.  This patch is to clean up several useless
> checks on VMAT_INVARIANT.  There should be no functional
> changes.
>
> Bootstrapped and regtested on x86_64-redhat-linux,
> aarch64-linux-gnu and powerpc64{,le}-linux-gnu.

OK.

> gcc/ChangeLog:
>
> * tree-vect-stmts.cc (vectorizable_load): Remove some useless checks
> on VMAT_INVARIANT.
> ---
>  gcc/tree-vect-stmts.cc | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
>
> diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> index 89607a98f99..d4e781531fd 100644
> --- a/gcc/tree-vect-stmts.cc
> +++ b/gcc/tree-vect-stmts.cc
> @@ -10499,7 +10499,7 @@ vectorizable_load (vec_info *vinfo,
>   tree bias = NULL_TREE;
>   if (!costing_p)
> {
> - if (loop_masks && memory_access_type != VMAT_INVARIANT)
> + if (loop_masks)
> final_mask
>   = vect_get_loop_mask (loop_vinfo, gsi, loop_masks,
> vec_num * ncopies, vectype,
> @@ -10729,7 +10729,7 @@ vectorizable_load (vec_info *vinfo,
> bias = build_int_cst (intQI_type_node, biasval);
>   }
>
> -   if (final_len && memory_access_type != VMAT_INVARIANT)
> +   if (final_len)
>   {
> tree ptr
>   = build_int_cst (ref_type, align * BITS_PER_UNIT);
> --
> 2.31.1

Re: [PATCH] vect: Move VMAT_LOAD_STORE_LANES handlings from final loop nest

On Mon, Aug 14, 2023 at 10:54 AM Kewen.Lin  wrote:
>
> Hi,
>
> Following Richi's suggestion [1], this patch is to move the
> handlings on VMAT_LOAD_STORE_LANES in the final loop nest
> of function vectorizable_load to its own loop.  Basically
> it duplicates the final loop nest, clean up some useless
> set up code for the case of VMAT_LOAD_STORE_LANES, remove
> some unreachable code.  Also remove the corresponding
> handlings in the final loop nest.
>
> Bootstrapped and regtested on x86_64-redhat-linux,
> aarch64-linux-gnu and powerpc64{,le}-linux-gnu.

OK (I guess the big diff is mostly because of re-indenting).

Thanks,
Richard.

> [1] https://gcc.gnu.org/pipermail/gcc-patches/2023-June/623329.html
>
> gcc/ChangeLog:
>
> * tree-vect-stmts.cc (vectorizable_load): Move the handlings on
> VMAT_LOAD_STORE_LANES in the final loop nest to its own loop,
> and update the final nest accordingly.
> ---
>  gcc/tree-vect-stmts.cc | 1275 
>  1 file changed, 634 insertions(+), 641 deletions(-)
>
> diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> index 4f2d088484c..c361e16cb7b 100644
> --- a/gcc/tree-vect-stmts.cc
> +++ b/gcc/tree-vect-stmts.cc
> @@ -10332,7 +10332,129 @@ vectorizable_load (vec_info *vinfo,
> vect_get_vec_defs_for_operand (vinfo, stmt_info, ncopies, mask,
>&vec_masks, mask_vectype);
>  }
> +
>tree vec_mask = NULL_TREE;
> +  if (memory_access_type == VMAT_LOAD_STORE_LANES)
> +{
> +  gcc_assert (alignment_support_scheme == dr_aligned
> + || alignment_support_scheme == dr_unaligned_supported);
> +  gcc_assert (grouped_load && !slp);
> +
> +  unsigned int inside_cost = 0, prologue_cost = 0;
> +  for (j = 0; j < ncopies; j++)
> +   {
> + if (costing_p)
> +   {
> + /* An IFN_LOAD_LANES will load all its vector results,
> +regardless of which ones we actually need.  Account
> +for the cost of unused results.  */
> + if (first_stmt_info == stmt_info)
> +   {
> + unsigned int gaps = DR_GROUP_SIZE (first_stmt_info);
> + stmt_vec_info next_stmt_info = first_stmt_info;
> + do
> +   {
> + gaps -= 1;
> + next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
> +   }
> + while (next_stmt_info);
> + if (gaps)
> +   {
> + if (dump_enabled_p ())
> +   dump_printf_loc (MSG_NOTE, vect_location,
> +"vect_model_load_cost: %d "
> +"unused vectors.\n",
> +gaps);
> + vect_get_load_cost (vinfo, stmt_info, gaps,
> + alignment_support_scheme,
> + misalignment, false, &inside_cost,
> + &prologue_cost, cost_vec, cost_vec,
> + true);
> +   }
> +   }
> + vect_get_load_cost (vinfo, stmt_info, 1, 
> alignment_support_scheme,
> + misalignment, false, &inside_cost,
> + &prologue_cost, cost_vec, cost_vec, true);
> + continue;
> +   }
> +
> + /* 1. Create the vector or array pointer update chain.  */
> + if (j == 0)
> +   dataref_ptr
> + = vect_create_data_ref_ptr (vinfo, first_stmt_info, aggr_type,
> + at_loop, offset, &dummy, gsi,
> + &ptr_incr, false, bump);
> + else
> +   {
> + gcc_assert (!LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo));
> + dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr, 
> gsi,
> +stmt_info, bump);
> +   }
> + if (mask)
> +   vec_mask = vec_masks[j];
> +
> + tree vec_array = create_vector_array (vectype, vec_num);
> +
> + tree final_mask = NULL_TREE;
> + if (loop_masks)
> +   final_mask = vect_get_loop_mask (loop_vinfo, gsi, loop_masks,
> +ncopies, vectype, j);
> + if (vec_mask)
> +   final_mask = prepare_vec_mask (loop_vinfo, mask_vectype, 
> final_mask,
> +  vec_mask, gsi);
> +
> + gcall *call;
> + if (final_mask)
> +   {
> + /* Emit:
> +  VEC_ARRAY = MASK_LOAD_LANES (DATAREF_PTR, ALIAS_PTR,
> +   VEC_MASK).  */
> + unsigned int align = TYPE_ALIGN (TREE_TYPE (vectype));
> +

Re: Re: [PATCH V4] VECT: Support loop len control on EXTRACT_LAST vectorization

On Mon, 14 Aug 2023, juzhe.zh...@rivai.ai wrote:

> -   != CODE_FOR_nothing)
> + != CODE_FOR_nothing
> +   && can_vec_extract_var_idx_p (
> + TYPE_MODE (vectype), TYPE_MODE (TREE_TYPE 
> (vectype
> 
> I think the 'can_vec_extract_var_idx_p' check alone may be enough, and the
> convert_optab_handler (vec_extract_optab,... check can be removed.
> Looking forward to more comments from Richi.

Yes, I think can_vec_extract_var_idx_p already does that so no need to
duplicate it here.

Richard.

> Thanks.
> 
> 
> juzhe.zh...@rivai.ai
>  
> From: Kewen.Lin
> Date: 2023-08-14 17:19
> To: Robin Dapp
> CC: gcc-patches; richard.sandiford; rguenther; juzhe.zh...@rivai.ai
> Subject: Re: [PATCH V4] VECT: Support loop len control on EXTRACT_LAST 
> vectorization
> Hi Robin,
>  
> on 2023/8/14 16:58, Robin Dapp wrote:
> > Hi Kewen,
> > 
> >> I did a bootstrapping and regression testing on Power10 (LE) and found a 
> >> lot of failures.
> > 
> > I think the problem is that just like for vec_set we're expecting
> > the vec_extract expander not to fail.  It is probably passed not a
> > const int here anymore and therefore fails to expand?
>  
> Thanks for the comments!  Yeah, I think the expectation doesn't hold
> on Power, as our vec_extract optab only support const index, that
> is:
>  
> (define_expand "vec_extract"
>   [(match_operand: 0 "register_operand")
>(match_operand:VEC_E 1 "vlogical_operand")
>(match_operand 2 "const_int_operand")]
>   "VECTOR_MEM_ALTIVEC_OR_VSX_P (mode)"
> {
>   rs6000_expand_vector_extract (operands[0], operands[1], operands[2]);
>   DONE;
> })
>  
> > 
> > can_vec_extract_var_idx_p is supposed to check if the backend
> > supports extracting a variable index.
>  
> OK, it sounds like this new capability needs a further check with
> function can_vec_extract_var_idx_p to ensure the ifn expansion works
> as expected.  I re-spun the patch by adding the below, per your comments:
>  
> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> index 07f3717ed9d..80ba5cae84a 100644
> --- a/gcc/tree-vect-loop.cc
> +++ b/gcc/tree-vect-loop.cc
> @@ -10328,7 +10328,9 @@ vectorizable_live_operation (vec_info *vinfo, 
> stmt_vec_info stmt_info,
>else if (convert_optab_handler (vec_extract_optab,
>TYPE_MODE (vectype),
>TYPE_MODE (TREE_TYPE 
> (vectype)))
> -   != CODE_FOR_nothing)
> + != CODE_FOR_nothing
> +   && can_vec_extract_var_idx_p (
> + TYPE_MODE (vectype), TYPE_MODE (TREE_TYPE 
> (vectype
>  vect_record_loop_len (loop_vinfo,
>&LOOP_VINFO_LENS (loop_vinfo),
>1, vectype, 1);
>  
> BR,
> Kewen
>  
> 

-- 
Richard Biener 
SUSE Software Solutions Germany GmbH,
Frankenstrasse 146, 90461 Nuernberg, Germany;
GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)

Re: Re: [PATCH V4] VECT: Support loop len control on EXTRACT_LAST vectorization

Hi, Kewen.

Could you test 'can_vec_extract_var_idx_p' and send V5 patch when you pass the 
testing?

Thanks.


juzhe.zh...@rivai.ai
 
From: Kewen.Lin
Date: 2023-08-14 17:19
To: Robin Dapp
CC: gcc-patches; richard.sandiford; rguenther; juzhe.zh...@rivai.ai
Subject: Re: [PATCH V4] VECT: Support loop len control on EXTRACT_LAST 
vectorization
Hi Robin,
 
on 2023/8/14 16:58, Robin Dapp wrote:
> Hi Kewen,
> 
>> I did a bootstrapping and regression testing on Power10 (LE) and found a lot 
>> of failures.
> 
> I think the problem is that just like for vec_set we're expecting
> the vec_extract expander not to fail.  It is probably passed not a
> const int here anymore and therefore fails to expand?
 
Thanks for the comments!  Yeah, I think the expectation doesn't hold
on Power, as our vec_extract optab only support const index, that
is:
 
(define_expand "vec_extract"
  [(match_operand: 0 "register_operand")
   (match_operand:VEC_E 1 "vlogical_operand")
   (match_operand 2 "const_int_operand")]
  "VECTOR_MEM_ALTIVEC_OR_VSX_P (mode)"
{
  rs6000_expand_vector_extract (operands[0], operands[1], operands[2]);
  DONE;
})
 
> 
> can_vec_extract_var_idx_p is supposed to check if the backend
> supports extracting a variable index.
 
OK, it sounds like this new capability needs a further check with
function can_vec_extract_var_idx_p to ensure the ifn expansion works
as expected.  I re-spun the patch by adding the below, per your comments:
 
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index 07f3717ed9d..80ba5cae84a 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -10328,7 +10328,9 @@ vectorizable_live_operation (vec_info *vinfo, 
stmt_vec_info stmt_info,
   else if (convert_optab_handler (vec_extract_optab,
   TYPE_MODE (vectype),
   TYPE_MODE (TREE_TYPE (vectype)))
-   != CODE_FOR_nothing)
+ != CODE_FOR_nothing
+   && can_vec_extract_var_idx_p (
+ TYPE_MODE (vectype), TYPE_MODE (TREE_TYPE (vectype
 vect_record_loop_len (loop_vinfo,
   &LOOP_VINFO_LENS (loop_vinfo),
   1, vectype, 1);
 
BR,
Kewen

[PATCH] RISC-V: Support MASK_LEN_{LOAD_LANES,STORE_LANES}

2023-08-14 Thread Juzhe-Zhong

This patch depends on middle-end support:
https://gcc.gnu.org/pipermail/gcc-patches/2023-August/627305.html

This patch allows us to auto-vectorize the following case:

#define TEST_LOOP(NAME, OUTTYPE, INTYPE, MASKTYPE) \
  void __attribute__ ((noinline, noclone)) \
  NAME##_8 (OUTTYPE *__restrict dest, INTYPE *__restrict src,  \
MASKTYPE *__restrict cond, intptr_t n) \
  {\
for (intptr_t i = 0; i < n; ++i)   \
  if (cond[i]) \
dest[i] = (src[i * 8] + src[i * 8 + 1] + src[i * 8 + 2]\
   + src[i * 8 + 3] + src[i * 8 + 4] + src[i * 8 + 5]  \
   + src[i * 8 + 6] + src[i * 8 + 7]); \
  }

#define TEST2(NAME, OUTTYPE, INTYPE)   \
  TEST_LOOP (NAME##_f32, OUTTYPE, INTYPE, int32_t)  
 \

#define TEST1(NAME, OUTTYPE)   \
  TEST2 (NAME##_i32, OUTTYPE, int32_t) \

#define TEST(NAME) \
  TEST1 (NAME##_i32, int32_t)  \

TEST (test)

ASM:

test_i32_i32_f32_8:
ble a3,zero,.L5
.L3:
vsetvli a4,a3,e8,mf4,ta,ma
vle32.v v0,0(a2)
vsetvli a5,zero,e32,m1,ta,ma
vmsne.viv0,v0,0
vsetvli zero,a4,e32,m1,ta,ma
vlseg8e32.v v8,(a1),v0.t
vsetvli a5,zero,e32,m1,ta,ma
sllia6,a4,2
vadd.vv v1,v9,v8
sllia7,a4,5
vadd.vv v1,v1,v10
sub a3,a3,a4
vadd.vv v1,v1,v11
vadd.vv v1,v1,v12
vadd.vv v1,v1,v13
vadd.vv v1,v1,v14
vadd.vv v1,v1,v15
vsetvli zero,a4,e32,m1,ta,ma
vse32.v v1,0(a0),v0.t
add a2,a2,a6
add a1,a1,a7
add a0,a0,a6
bne a3,zero,.L3
.L5:
ret

gcc/ChangeLog:

* config/riscv/autovec.md (vec_mask_len_load_lanes): New 
pattern.
(vec_mask_len_store_lanes): Ditto.
(2): Fix pattern for ICE.
(2): Ditto.
* config/riscv/riscv-protos.h (expand_lanes_load_store): New function.
* config/riscv/riscv-v.cc (get_mask_mode): Add tuple mode mask mode.
(expand_lanes_load_store): New function.
* config/riscv/vector-iterators.md: New iterator.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/autovec/gather-scatter/strided_load-2.c: Adapt 
tests.
* gcc.target/riscv/rvv/autovec/partial/slp-2.c: Ditto.
* gcc.target/riscv/rvv/autovec/partial/slp-4.c: Ditto.
* gcc.target/riscv/rvv/rvv.exp: Add lanes test.
* gcc.target/riscv/rvv/autovec/struct/mask_struct_load-1.c: New test.
* gcc.target/riscv/rvv/autovec/struct/mask_struct_load-2.c: New test.
* gcc.target/riscv/rvv/autovec/struct/mask_struct_load-3.c: New test.
* gcc.target/riscv/rvv/autovec/struct/mask_struct_load-4.c: New test.
* gcc.target/riscv/rvv/autovec/struct/mask_struct_load-5.c: New test.
* gcc.target/riscv/rvv/autovec/struct/mask_struct_load-6.c: New test.
* gcc.target/riscv/rvv/autovec/struct/mask_struct_load-7.c: New test.
* gcc.target/riscv/rvv/autovec/struct/mask_struct_load_run-1.c: New 
test.
* gcc.target/riscv/rvv/autovec/struct/mask_struct_load_run-2.c: New 
test.
* gcc.target/riscv/rvv/autovec/struct/mask_struct_load_run-3.c: New 
test.
* gcc.target/riscv/rvv/autovec/struct/mask_struct_load_run-4.c: New 
test.
* gcc.target/riscv/rvv/autovec/struct/mask_struct_load_run-5.c: New 
test.
* gcc.target/riscv/rvv/autovec/struct/mask_struct_load_run-6.c: New 
test.
* gcc.target/riscv/rvv/autovec/struct/mask_struct_load_run-7.c: New 
test.
* gcc.target/riscv/rvv/autovec/struct/mask_struct_store-1.c: New test.
* gcc.target/riscv/rvv/autovec/struct/mask_struct_store-2.c: New test.
* gcc.target/riscv/rvv/autovec/struct/mask_struct_store-3.c: New test.
* gcc.target/riscv/rvv/autovec/struct/mask_struct_store-4.c: New test.
* gcc.target/riscv/rvv/autovec/struct/mask_struct_store-5.c: New test.
* gcc.target/riscv/rvv/autovec/struct/mask_struct_store-6.c: New test.
* gcc.target/riscv/rvv/autovec/struct/mask_struct_store-7.c: New test.
* gcc.target/riscv/rvv/autovec/struct/mask_struct_store_run-1.c: New 
test.
* gcc.target/riscv/rvv/autovec/struct/mask_struct_store_run-2.c: New 
test.
* gcc.target/riscv/rvv/autovec/struct/mask_struct_store_run-3.c: New 
test.
* gcc.target/riscv/rvv/autovec/struct/mask_struct_store_run-4.c: New 
test.

Re: [PATCH] vect: Move VMAT_GATHER_SCATTER handlings from final loop nest

2023-08-14 Thread Richard Sandiford via Gcc-patches

Thanks for the clean-ups.  But...

"Kewen.Lin"  writes:
> Hi,
>
> Following Richi's suggestion [1], this patch is to move the
> handlings on VMAT_GATHER_SCATTER in the final loop nest
> of function vectorizable_load to its own loop.  Basically
> it duplicates the final loop nest, clean up some useless
> set up code for the case of VMAT_GATHER_SCATTER, remove some
> unreachable code.  Also remove the corresponding handlings
> in the final loop nest.
>
> Bootstrapped and regtested on x86_64-redhat-linux,
> aarch64-linux-gnu and powerpc64{,le}-linux-gnu.
>
> [1] https://gcc.gnu.org/pipermail/gcc-patches/2023-June/623329.html
>
> Is it ok for trunk?
>
> BR,
> Kewen
> -
>
> gcc/ChangeLog:
>
>   * tree-vect-stmts.cc (vectorizable_load): Move the handlings on
>   VMAT_GATHER_SCATTER in the final loop nest to its own loop,
>   and update the final nest accordingly.
> ---
>  gcc/tree-vect-stmts.cc | 361 +
>  1 file changed, 219 insertions(+), 142 deletions(-)

...that seems like quite a lot of +s.  Is there nothing we can do to
avoid the cut-&-paste?

Richard

>
> diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> index c361e16cb7b..5e514eca19b 100644
> --- a/gcc/tree-vect-stmts.cc
> +++ b/gcc/tree-vect-stmts.cc
> @@ -10455,6 +10455,218 @@ vectorizable_load (vec_info *vinfo,
>return true;
>  }
>
> +  if (memory_access_type == VMAT_GATHER_SCATTER)
> +{
> +  gcc_assert (alignment_support_scheme == dr_aligned
> +   || alignment_support_scheme == dr_unaligned_supported);
> +  gcc_assert (!grouped_load && !slp_perm);
> +
> +  unsigned int inside_cost = 0, prologue_cost = 0;
> +  for (j = 0; j < ncopies; j++)
> + {
> +   /* 1. Create the vector or array pointer update chain.  */
> +   if (j == 0 && !costing_p)
> + {
> +   if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
> + vect_get_gather_scatter_ops (loop_vinfo, loop, stmt_info,
> +  slp_node, &gs_info, &dataref_ptr,
> +  &vec_offsets);
> +   else
> + dataref_ptr
> +   = vect_create_data_ref_ptr (vinfo, first_stmt_info, aggr_type,
> +   at_loop, offset, &dummy, gsi,
> +   &ptr_incr, false, bump);
> + }
> +   else if (!costing_p)
> + {
> +   gcc_assert (!LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo));
> +   if (!STMT_VINFO_GATHER_SCATTER_P (stmt_info))
> + dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr,
> +gsi, stmt_info, bump);
> + }
> +
> +   if (mask && !costing_p)
> + vec_mask = vec_masks[j];
> +
> +   gimple *new_stmt = NULL;
> +   for (i = 0; i < vec_num; i++)
> + {
> +   tree final_mask = NULL_TREE;
> +   tree final_len = NULL_TREE;
> +   tree bias = NULL_TREE;
> +   if (!costing_p)
> + {
> +   if (loop_masks)
> + final_mask
> +   = vect_get_loop_mask (loop_vinfo, gsi, loop_masks,
> + vec_num * ncopies, vectype,
> + vec_num * j + i);
> +   if (vec_mask)
> + final_mask = prepare_vec_mask (loop_vinfo, mask_vectype,
> +final_mask, vec_mask, gsi);
> +
> +   if (i > 0 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
> + dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr,
> +gsi, stmt_info, bump);
> + }
> +
> +   /* 2. Create the vector-load in the loop.  */
> +   unsigned HOST_WIDE_INT align;
> +   if (gs_info.ifn != IFN_LAST)
> + {
> +   if (costing_p)
> + {
> +   unsigned int cnunits = vect_nunits_for_cost (vectype);
> +   inside_cost
> + = record_stmt_cost (cost_vec, cnunits, scalar_load,
> + stmt_info, 0, vect_body);
> +   continue;
> + }
> +   if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
> + vec_offset = vec_offsets[vec_num * j + i];
> +   tree zero = build_zero_cst (vectype);
> +   tree scale = size_int (gs_info.scale);
> +
> +   if (gs_info.ifn == IFN_MASK_LEN_GATHER_LOAD)
> + {
> +   if (loop_lens)
> + final_len
> +   = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
> +vec_num * ncopies, vectype,
> +vec_num * j + i, 1);
> +   else
> +

Re: [PATCH V4] VECT: Support loop len control on EXTRACT_LAST vectorization

Hi Juzhe,

on 2023/8/14 20:08, juzhe.zh...@rivai.ai wrote:
> Hi, Kewen.
> 
> Could you test 'can_vec_extract_var_idx_p' and send V5 patch when you pass 
> the testing?

The below diff was bootstrapped and regress-tested on Power10 LE.  Compared to
the previous v4, the only changes should be the proposed
can_vec_extract_var_idx_p check and its required new includes, as below:

+#include "memmodel.h"
+#include "optabs.h"
 
Could you have a double check?

Since I just tested it on Power10, you have the full ownership on the patch, 
I'd leave
the v5 posting to you.  Thanks!

BR,
Kewen
-
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index bc3063c3615..5ae9f69c7eb 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -32,6 +32,8 @@ along with GCC; see the file COPYING3.  If not see
 #include "tree-pass.h"
 #include "ssa.h"
 #include "optabs-tree.h"
+#include "memmodel.h"
+#include "optabs.h"
 #include "diagnostic-core.h"
 #include "fold-const.h"
 #include "stor-layout.h"
@@ -10300,17 +10302,7 @@ vectorizable_live_operation (vec_info *vinfo, 
stmt_vec_info stmt_info,
   /* No transformation required.  */
   if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
{
- if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
-  OPTIMIZE_FOR_SPEED))
-   {
- if (dump_enabled_p ())
-   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-"can't operate on partial vectors "
-"because the target doesn't support extract "
-"last reduction.\n");
- LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
-   }
- else if (slp_node)
+ if (slp_node)
{
  if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
@@ -10330,9 +10322,26 @@ vectorizable_live_operation (vec_info *vinfo, 
stmt_vec_info stmt_info,
  else
{
  gcc_assert (ncopies == 1 && !slp_node);
- vect_record_loop_mask (loop_vinfo,
-&LOOP_VINFO_MASKS (loop_vinfo),
-1, vectype, NULL);
+ if (direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
+ OPTIMIZE_FOR_SPEED))
+   vect_record_loop_mask (loop_vinfo,
+  &LOOP_VINFO_MASKS (loop_vinfo),
+  1, vectype, NULL);
+ else if (can_vec_extract_var_idx_p (
+TYPE_MODE (vectype), TYPE_MODE (TREE_TYPE (vectype
+   vect_record_loop_len (loop_vinfo,
+ &LOOP_VINFO_LENS (loop_vinfo),
+ 1, vectype, 1);
+ else
+   {
+ if (dump_enabled_p ())
+   dump_printf_loc (
+ MSG_MISSED_OPTIMIZATION, vect_location,
+ "can't operate on partial vectors "
+ "because the target doesn't support extract "
+ "last reduction.\n");
+ LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
+   }
}
}
   /* ???  Enable for loop costing as well.  */
@@ -10358,7 +10367,9 @@ vectorizable_live_operation (vec_info *vinfo, 
stmt_vec_info stmt_info,
   gimple *vec_stmt;
   if (slp_node)
 {
-  gcc_assert (!loop_vinfo || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
+  gcc_assert (!loop_vinfo
+ || (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
+ && !LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo)));

   /* Get the correct slp vectorized stmt.  */
   vec_lhs = SLP_TREE_VEC_DEFS (slp_node)[vec_entry];
@@ -10402,7 +10413,42 @@ vectorizable_live_operation (vec_info *vinfo, 
stmt_vec_info stmt_info,

   gimple_seq stmts = NULL;
   tree new_tree;
-  if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+  if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
+   {
+ /* Emit:
+
+  SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>
+
+where VEC_LHS is the vectorized live-out result and MASK is
+the loop mask for the final iteration.  */
+ gcc_assert (ncopies == 1 && !slp_node);
+ gimple_seq tem = NULL;
+ gimple_stmt_iterator gsi = gsi_last (tem);
+ tree len
+   = vect_get_loop_len (loop_vinfo, &gsi,
+&LOOP_VINFO_LENS (loop_vinfo),
+1, vectype, 0, 0);
+
+ /* BIAS - 1.  */
+ signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
+ tree bias_minus_one
+   = int_const_binop (MINUS_EXPR,
+  build_int_cst (TREE_TYPE (len), biasval)

[PATCH v1] RISC-V: Support RVV VFREC7 rounding mode intrinsic API

2023-08-14 Thread Pan Li via Gcc-patches

From: Pan Li 

This patch adds support for the rounding mode API for VFREC7,
as shown in the samples below.

* __riscv_vfrec7_v_f32m1_rm
* __riscv_vfrec7_v_f32m1_rm_m

Signed-off-by: Pan Li 

gcc/ChangeLog:

* config/riscv/riscv-vector-builtins-bases.cc
(class vfrec7_frm): New class for frm.
(vfrec7_frm_obj): New declaration.
(BASE): Ditto.
* config/riscv/riscv-vector-builtins-bases.h: Ditto.
* config/riscv/riscv-vector-builtins-functions.def
(vfrec7_frm): New intrinsic function definition.
* config/riscv/vector-iterators.md
(VFMISC): Remove VFREC7.
(misc_op): Ditto.
(float_insn_type): Ditto.
(VFMISC_FRM): New int iterator.
(misc_frm_op): New op for frm.
(float_frm_insn_type): New type for frm.
* config/riscv/vector.md (@pred_<misc_frm_op><mode>):
New pattern for misc frm.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/base/float-point-rec7.c: New test.
---
 .../riscv/riscv-vector-builtins-bases.cc  | 17 ++
 .../riscv/riscv-vector-builtins-bases.h   |  1 +
 .../riscv/riscv-vector-builtins-functions.def |  2 ++
 gcc/config/riscv/vector-iterators.md  | 12 +--
 gcc/config/riscv/vector.md| 23 ++
 .../riscv/rvv/base/float-point-rec7.c | 31 +++
 6 files changed, 83 insertions(+), 3 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/base/float-point-rec7.c

diff --git a/gcc/config/riscv/riscv-vector-builtins-bases.cc 
b/gcc/config/riscv/riscv-vector-builtins-bases.cc
index 2074dac0f16..249ac4e68cd 100644
--- a/gcc/config/riscv/riscv-vector-builtins-bases.cc
+++ b/gcc/config/riscv/riscv-vector-builtins-bases.cc
@@ -646,6 +646,21 @@ public:
   }
 };
 
+/* Implements below instructions for frm
+   - vfrec7
+*/
+template
+class vfrec7_frm : public function_base
+{
+public:
+  bool has_rounding_mode_operand_p () const override { return true; }
+
+  rtx expand (function_expander &e) const override
+  {
+return e.use_exact_insn (code_for_pred (UNSPEC, e.vector_mode ()));
+  }
+};
+
 /* Implements vrsub.  */
 class vrsub : public function_base
 {
@@ -2433,6 +2448,7 @@ static CONSTEXPR const unop vfsqrt_obj;
 static CONSTEXPR const unop_frm vfsqrt_frm_obj;
 static CONSTEXPR const float_misc vfrsqrt7_obj;
 static CONSTEXPR const float_misc vfrec7_obj;
+static CONSTEXPR const vfrec7_frm vfrec7_frm_obj;
 static CONSTEXPR const binop vfmin_obj;
 static CONSTEXPR const binop vfmax_obj;
 static CONSTEXPR const float_misc vfsgnj_obj;
@@ -2681,6 +2697,7 @@ BASE (vfsqrt)
 BASE (vfsqrt_frm)
 BASE (vfrsqrt7)
 BASE (vfrec7)
+BASE (vfrec7_frm)
 BASE (vfmin)
 BASE (vfmax)
 BASE (vfsgnj)
diff --git a/gcc/config/riscv/riscv-vector-builtins-bases.h 
b/gcc/config/riscv/riscv-vector-builtins-bases.h
index 5c91381bd4c..2a9381eec5e 100644
--- a/gcc/config/riscv/riscv-vector-builtins-bases.h
+++ b/gcc/config/riscv/riscv-vector-builtins-bases.h
@@ -187,6 +187,7 @@ extern const function_base *const vfsqrt;
 extern const function_base *const vfsqrt_frm;
 extern const function_base *const vfrsqrt7;
 extern const function_base *const vfrec7;
+extern const function_base *const vfrec7_frm;
 extern const function_base *const vfmin;
 extern const function_base *const vfmax;
 extern const function_base *const vfsgnj;
diff --git a/gcc/config/riscv/riscv-vector-builtins-functions.def 
b/gcc/config/riscv/riscv-vector-builtins-functions.def
index a821aca6a4b..34def6bb82f 100644
--- a/gcc/config/riscv/riscv-vector-builtins-functions.def
+++ b/gcc/config/riscv/riscv-vector-builtins-functions.def
@@ -396,6 +396,8 @@ DEF_RVV_FUNCTION (vfrsqrt7, alu, full_preds, f_v_ops)
 // 13.10. Vector Floating-Point Reciprocal Estimate Instruction
 DEF_RVV_FUNCTION (vfrec7, alu, full_preds, f_v_ops)
 
+DEF_RVV_FUNCTION (vfrec7_frm, alu_frm, full_preds, f_v_ops)
+
 // 13.11. Vector Floating-Point MIN/MAX Instructions
 DEF_RVV_FUNCTION (vfmin, alu, full_preds, f_vvv_ops)
 DEF_RVV_FUNCTION (vfmin, alu, full_preds, f_vvf_ops)
diff --git a/gcc/config/riscv/vector-iterators.md 
b/gcc/config/riscv/vector-iterators.md
index 30808ceb241..9dd611e254b 100644
--- a/gcc/config/riscv/vector-iterators.md
+++ b/gcc/config/riscv/vector-iterators.md
@@ -1867,7 +1867,9 @@ (define_int_iterator VSAT_SHIFT_OP [UNSPEC_VSSRL 
UNSPEC_VSSRA])
 
 (define_int_iterator VMISC [UNSPEC_VMSBF UNSPEC_VMSIF UNSPEC_VMSOF])
 
-(define_int_iterator VFMISC [UNSPEC_VFRSQRT7 UNSPEC_VFREC7])
+(define_int_iterator VFMISC [UNSPEC_VFRSQRT7])
+
+(define_int_iterator VFMISC_FRM [UNSPEC_VFREC7])
 
 (define_int_iterator VFCVTS [UNSPEC_VFCVT UNSPEC_UNSIGNED_VFCVT])
 
@@ -1890,9 +1892,13 @@ (define_int_attr sat_insn_type [(UNSPEC_VAADDU "vaalu") 
(UNSPEC_VAADD "vaalu")
(UNSPEC_VNCLIPU "vnclip")])
 
 (define_int_attr misc_op [(UNSPEC_VMSBF "sbf") (UNSPEC_VMSIF "sif") 
(UNSPEC_VMSOF "sof")
- (UNSPEC_VFRSQRT7 "rsqrt7") (UNSPEC_VFREC7 "rec7")])
+

Re: [PATCH] vect: Move VMAT_LOAD_STORE_LANES handlings from final loop nest

Hi Richi,

on 2023/8/14 20:04, Richard Biener wrote:
> On Mon, Aug 14, 2023 at 10:54 AM Kewen.Lin  wrote:
>>
>> Hi,
>>
>> Following Richi's suggestion [1], this patch is to move the
>> handlings on VMAT_LOAD_STORE_LANES in the final loop nest
>> of function vectorizable_load to its own loop.  Basically
>> it duplicates the final loop nest, clean up some useless
>> set up code for the case of VMAT_LOAD_STORE_LANES, remove
>> some unreachable code.  Also remove the corresponding
>> handlings in the final loop nest.
>>
>> Bootstrapped and regtested on x86_64-redhat-linux,
>> aarch64-linux-gnu and powerpc64{,le}-linux-gnu.
> 
> OK (I guess the big diff is mostly because of re-indenting).

Thanks!  Yes, there is some code in the original final loop nest like

if (memory_access_type == VMAT_LOAD_STORE_LANES)
  {
...
  }
else
  {
...
  }

Then the else arm is fully re-indented.

The other patch on VMAT_GATHER_SCATTER looks a bit better since
it doesn't need re-indenting.

BR,
Kewen

> 
> Thanks,
> Richard.
> 
>> [1] https://gcc.gnu.org/pipermail/gcc-patches/2023-June/623329.html
>>
>> gcc/ChangeLog:
>>
>> * tree-vect-stmts.cc (vectorizable_load): Move the handlings on
>> VMAT_LOAD_STORE_LANES in the final loop nest to its own loop,
>> and update the final nest accordingly.
>> ---
>>  gcc/tree-vect-stmts.cc | 1275 
>>  1 file changed, 634 insertions(+), 641 deletions(-)
>>
>> diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
>> index 4f2d088484c..c361e16cb7b 100644
>> --- a/gcc/tree-vect-stmts.cc
>> +++ b/gcc/tree-vect-stmts.cc
>> @@ -10332,7 +10332,129 @@ vectorizable_load (vec_info *vinfo,
>> vect_get_vec_defs_for_operand (vinfo, stmt_info, ncopies, mask,
>>&vec_masks, mask_vectype);
>>  }
>> +
>>tree vec_mask = NULL_TREE;
>> +  if (memory_access_type == VMAT_LOAD_STORE_LANES)
>> +{
>> +  gcc_assert (alignment_support_scheme == dr_aligned
>> + || alignment_support_scheme == dr_unaligned_supported);
>> +  gcc_assert (grouped_load && !slp);
>> +
>> +  unsigned int inside_cost = 0, prologue_cost = 0;
>> +  for (j = 0; j < ncopies; j++)
>> +   {
>> + if (costing_p)
>> +   {
>> + /* An IFN_LOAD_LANES will load all its vector results,
>> +regardless of which ones we actually need.  Account
>> +for the cost of unused results.  */
>> + if (first_stmt_info == stmt_info)
>> +   {
>> + unsigned int gaps = DR_GROUP_SIZE (first_stmt_info);
>> + stmt_vec_info next_stmt_info = first_stmt_info;
>> + do
>> +   {
>> + gaps -= 1;
>> + next_stmt_info = DR_GROUP_NEXT_ELEMENT 
>> (next_stmt_info);
>> +   }
>> + while (next_stmt_info);
>> + if (gaps)
>> +   {
>> + if (dump_enabled_p ())
>> +   dump_printf_loc (MSG_NOTE, vect_location,
>> +"vect_model_load_cost: %d "
>> +"unused vectors.\n",
>> +gaps);
>> + vect_get_load_cost (vinfo, stmt_info, gaps,
>> + alignment_support_scheme,
>> + misalignment, false, &inside_cost,
>> + &prologue_cost, cost_vec, cost_vec,
>> + true);
>> +   }
>> +   }
>> + vect_get_load_cost (vinfo, stmt_info, 1, 
>> alignment_support_scheme,
>> + misalignment, false, &inside_cost,
>> + &prologue_cost, cost_vec, cost_vec, true);
>> + continue;
>> +   }
>> +
>> + /* 1. Create the vector or array pointer update chain.  */
>> + if (j == 0)
>> +   dataref_ptr
>> + = vect_create_data_ref_ptr (vinfo, first_stmt_info, aggr_type,
>> + at_loop, offset, &dummy, gsi,
>> + &ptr_incr, false, bump);
>> + else
>> +   {
>> + gcc_assert (!LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo));
>> + dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr, 
>> gsi,
>> +stmt_info, bump);
>> +   }
>> + if (mask)
>> +   vec_mask = vec_masks[j];
>> +
>> + tree vec_array = create_vector_array (vectype, vec_num);
>> +
>> + tree final_mask = NULL_TREE;
>> + if (loop_masks)
>> +   final_mask = vect_get_loop_mask (loop_vinfo, gsi, loop_masks,
>> +ncopies, vectype, j);
>> +

RE: [2/2] RISC-V: Constant FP Optimization with 'Zfa'

2023-08-14 Thread Jin Ma via Gcc-patches

Hi Tsukasa,
  What a coincidence: I also implemented the Zfa extension, including the
fli-related instructions :)

links: https://gcc.gnu.org/pipermail/gcc-patches/2023-August/627294.html

> +  if (!TARGET_HARD_FLOAT || !TARGET_ZFA)
> +return result;
> +  switch (GET_MODE (x))
> +{
> +case HFmode:
> +  /* Not only 'Zfhmin', either 'Zfh' or 'Zvfh' is required.  */
> +  if (!TARGET_ZFH && !TARGET_ZVFH)

Zvfh means that Zfh is also enabled, so there may be no need to check
TARGET_ZVFH here.  By the way, the formatting here seems wrong; maybe a tab
is needed for alignment?

> + return result;
> +  break;
> +case SFmode: break;
> +case DFmode: break;

Maybe we still have to check TARGET_DOUBLE_FLOAT here?

> +default: return result;
> +}
> +
> +  if (!CONST_DOUBLE_P (x))
> +return result;

I think it might be better to check whether x satisfies CONST_DOUBLE_P
before the switch (GET_MODE (x)) above.

> +
> +  r = *CONST_DOUBLE_REAL_VALUE (x);
> +
> +  if (REAL_VALUE_ISNAN (r))
> +{
> +  long reprs[2] = { 0 };
> +  /* Compare with canonical NaN.  */
> +  switch (GET_MODE (x))
> + {
> + case HFmode:
> +   reprs[0] = real_to_target (NULL, &r,
> +  float_mode_for_size (16).require ());
> +   /* 0x7e00: Canonical NaN for binary16.  */
> +   if (reprs[0] != 0x7e00)
> + return result;
> +   break;
> + case SFmode:
> +   reprs[0] = real_to_target (NULL, &r,
> +  float_mode_for_size (32).require ());
> +   /* 0x7fc0: Canonical NaN for binary32.  */
> +   if (reprs[0] != 0x7fc0)
> + return result;
> +   break;
> + case DFmode:
> +   real_to_target (reprs, &r, float_mode_for_size (64).require ());
> +   if (FLOAT_WORDS_BIG_ENDIAN)
> + std::swap (reprs[0], reprs[1]);
> +   /* 0x7ff8_: Canonical NaN for binary64.  */
> +   if (reprs[0] != 0 || reprs[1] != 0x7ff8)
> + return result;
> +   break;
> + default:
> +   gcc_unreachable ();
> + }
> +  result.type = RISCV_FLOAT_CONST_NAN;
> +  result.valid = true;
> +  return result;
> +}
> +  else if (REAL_VALUE_ISINF (r))
> +{
> +  if (REAL_VALUE_NEGATIVE (r))
> + return result;
> +  result.type = RISCV_FLOAT_CONST_INF;
> +  result.valid = true;
> +  return result;
> +}
> +
> +  bool sign = REAL_VALUE_NEGATIVE (r);
> +  result.sign = sign;
> +
> +  r = real_value_abs (&r);
> +  /* GCC internally does not use IEEE754-like encoding (where normalized
> + significands are in the range [1, 2).  GCC uses [0.5, 1) (see real.cc).
> + So, this exponent_p1 variable equals IEEE754 unbiased exponent + 1.  */
> +  int exponent_p1 = REAL_EXP (&r);
> +
> +  /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
> + highest (sign) bit, with a fixed binary point at bit point_pos.
> + m1 holds the low part of the mantissa, m2 the high part.
> + WARNING: If we ever have a representation using more than 2 * H_W_I - 1
> + bits for the mantissa, this can fail (low bits will be lost).  */
> +  bool fail = false;
> +  real_ldexp (&m, &r, (2 * HOST_BITS_PER_WIDE_INT - 1) - exponent_p1);
> +  wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
> +  if (fail)
> +return result;
> +
> +  /* If the low part of the mantissa has bits set we cannot represent
> + the value.  */
> +  if (w.ulow () != 0)
> +return result;
> +  /* We have rejected the lower HOST_WIDE_INT, so update our
> + understanding of how many bits lie in the mantissa and
> + look only at the high HOST_WIDE_INT.  */
> +  unsigned HOST_WIDE_INT mantissa = w.elt (1);
> +
> +  /* We cannot represent the value 0.0.  */
> +  if (mantissa == 0)
> +return result;
> +
> +  /* We can only represent values with a mantissa of the form 1.xx.  */
> +  unsigned HOST_WIDE_INT mask
> +  = ((unsigned HOST_WIDE_INT) 1 << (HOST_BITS_PER_WIDE_INT - 4)) - 1;
> +  if ((mantissa & mask) != 0)
> +return result;
> +  mantissa >>= HOST_BITS_PER_WIDE_INT - 4;
> +  /* Now the lowest 3-bits of mantissa should form (1.xx)b.  */
> +  gcc_assert (mantissa & (1u << 2));
> +  /* Mask out the highest bit.  */
> +  mantissa &= ~(1u << 2);
> +
> +  if (mantissa == 0)
> +{
> +  /* We cannot represent any values but -1.0.  */
> +  if (exponent_p1 != 1 && sign)
> + return result;
> +  switch (exponent_p1)
> + {
> + case -15: /* 1.0 * 2^(-16)  */
> + case -14: /* 1.0 * 2^(-15)  */
> + case -7:  /* 1.0 * 2^(- 8)  */
> + case -6:  /* 1.0 * 2^(- 7)  */
> + case 8:   /* 1.0 * 2^(+ 7)  */
> + case 9:   /* 1.0 * 2^(+ 8)  */
> + case 16:  /* 1.0 * 2^(+15)  */
> + case 17:  /* 1.0 * 2^(+16)  */
> +   break;
> + default:
> +   if (exponent_p1 >= -3 && exponent_p1 <= 5)
> + /* 1.0 * 2^[-4,4]  */
> + break;
> +   switch (GET

Re: [RFC] [v2] Extend fold_vec_perm to handle VLA vectors

2023-08-14 Thread Richard Sandiford via Gcc-patches

Prathamesh Kulkarni  writes:
> On Thu, 10 Aug 2023 at 21:27, Richard Sandiford
>  wrote:
>>
>> Prathamesh Kulkarni  writes:
>> >> static bool
>> >> is_simple_vla_size (poly_uint64 size)
>> >> {
>> >>   if (size.is_constant ())
>> >> return false;
>> >>   for (int i = 1; i < ARRAY_SIZE (size.coeffs); ++i)
>> >> if (size[i] != (i <= 1 ? size[0] : 0))
>> > Just wondering if this should be (i == 1 ? size[0] : 0) since i is
>> > initialized to 1 ?
>>
>> Both work.  I prefer <= 1 because it doesn't depend on the micro
>> optimisation to start at coefficient 1.  In a theoretical 3-indeterminate
>> poly_int, we want the first 2 coefficients to be nonzero and the rest to
>> be zero.
>>
>> > IIUC, is_simple_vla_size should return true for polynomials of first
>> > degree and having same coeff like 4 + 4x ?
>>
>> FWIW, poly_int only supports first-degree polynomials at the moment.
>> coeffs>2 means there is more than one indeterminate, rather than a
>> higher power.
> Oh OK, thanks for the clarification.
>>
>> >>   return false;
>> >>   return true;
>> >> }
>> >>
>> >>
>> >>   FOR_EACH_MODE_IN_CLASS (mode, MODE_VECTOR_INT)
>> >> {
>> >>   auto nunits = GET_MODE_NUNITS (mode);
>> >>   if (!is_simple_vla_size (nunits))
>> >> continue;
>> >>   if (nunits[0] ...)
>> >> test_... (mode);
>> >>   ...
>> >>
>> >> }
>> >>
>> >> test_vnx4si_v4si and test_v4si_vnx4si look good.  But with the
>> >> loop structure above, I think we can apply the test_vnx4si and
>> >> test_vnx16qi to more cases.  So the classification isn't the
>> >> exact number of elements, but instead a limit.
>> >>
>> >> I think the nunits[0] conditions for test_vnx4si are as follows
>> >> (inspection only, so could be wrong):
>> >>
>> >> > +/* Test cases where result and input vectors are VNx4SI  */
>> >> > +
>> >> > +static void
>> >> > +test_vnx4si (machine_mode vmode)
>> >> > +{
>> >> > +  /* Case 1: mask = {0, ...} */
>> >> > +  {
>> >> > +tree arg0 = build_vec_cst_rand (vmode, 2, 3, 1);
>> >> > +tree arg1 = build_vec_cst_rand (vmode, 2, 3, 1);
>> >> > +poly_uint64 len = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0));
>> >> > +
>> >> > +vec_perm_builder builder (len, 1, 1);
>> >> > +builder.quick_push (0);
>> >> > +vec_perm_indices sel (builder, 2, len);
>> >> > +tree res = fold_vec_perm_cst (TREE_TYPE (arg0), arg0, arg1, sel);
>> >> > +
>> >> > +tree expected_res[] = { vector_cst_elt (res, 0) };
>> > This should be { vector_cst_elt (arg0, 0) }; will fix in next patch.
>> >> > +validate_res (1, 1, res, expected_res);
>> >> > +  }
>> >>
>> >> nunits[0] >= 2 (could be all nunits if the inputs had 
>> >> nelts_per_pattern==1,
>> >> which I think would be better)
>> > IIUC, the vectors that can be used for a particular test should have
>> > nunits[0] >= res_npatterns,
>> > where res_npatterns is as computed in fold_vec_perm_cst without the
>> > canonicalization ?
>> > For above test -- res_npatterns = max(2, max (2, 1)) == 2, so we
>> > require nunits[0] >= 2 ?
>> > Which implies we can use above test for vectors with length 2 + 2x, 4 + 
>> > 4x, etc.
>>
>> Right, that's what I meant.  With the inputs as they stand it has to be
>> nunits[0] >= 2.  We need that to form the inputs correctly.  But if the
>> inputs instead had nelts_per_pattern == 1, the test would work for all
>> nunits.
> In the attached patch, I have reordered the tests based on min or max limit.
> For tests where sel_npatterns < 3 (ie dup sequence), I have kept input
> npatterns = 1,
> so we can test more vector modes, and also input npatterns matter only
> for stepped sequence in sel
> (Since for a dup pattern we don't enforce the constraint of selecting
> elements from same input pattern).
> Does it look OK ?
>
> For the following tests with input vectors having shape (1, 3)
> sel = {0, 1, 2, ...}  // (1, 3)
> res = { arg0[0], arg0[1], arg0[2], ... } // (1, 3)
>
> and sel = {len, len + 1, len + 2, ... }  // (1, 3)
> res = { arg1[0], arg1[1], arg1[2], ... } // (1, 3)
>
> Although res_npatterns = 1, I suppose these will need to be tested with
> vectors with length >= 4 + 4x,
> since index 2 can be ambiguous for length 2 + 2x  ?
> (In the patch, these are cases 2 and 3 in test_nunits_min_4)

Ah, yeah, fair point.  I guess that means:

+  /* Case 3: mask = {len, 0, 1, ...} // (1, 3)
+Test that stepped sequence of the pattern selects from arg0.
+res = { arg1[0], arg0[0], arg0[1], ... } // (1, 3)  */
+  {
+   tree arg0 = build_vec_cst_rand (vmode, 1, 3, 1);
+   tree arg1 = build_vec_cst_rand (vmode, 1, 3, 1);
+   poly_uint64 len = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0));
+
+   vec_perm_builder builder (len, 1, 3);
+   poly_uint64 mask_elems[] = { len, 0, 1 };
+   builder_push_elems (builder, mask_elems);
+
+   vec_perm_indices sel (builder, 2, len);
+   tree res = fold_vec_perm_cst (TREE_TYPE (arg0), arg0, arg1, sel);
+
+   tree expected_res[] = { ARG1(0), ARG0(0)

Re: [PATCH] vect: Move VMAT_GATHER_SCATTER handlings from final loop nest

Hi Richard,

on 2023/8/14 20:20, Richard Sandiford wrote:
> Thanks for the clean-ups.  But...
> 
> "Kewen.Lin"  writes:
>> Hi,
>>
>> Following Richi's suggestion [1], this patch is to move the
>> handlings on VMAT_GATHER_SCATTER in the final loop nest
>> of function vectorizable_load to its own loop.  Basically
>> it duplicates the final loop nest, clean up some useless
>> set up code for the case of VMAT_GATHER_SCATTER, remove some
>> unreachable code.  Also remove the corresponding handlings
>> in the final loop nest.
>>
>> Bootstrapped and regtested on x86_64-redhat-linux,
>> aarch64-linux-gnu and powerpc64{,le}-linux-gnu.
>>
>> [1] https://gcc.gnu.org/pipermail/gcc-patches/2023-June/623329.html
>>
>> Is it ok for trunk?
>>
>> BR,
>> Kewen
>> -
>>
>> gcc/ChangeLog:
>>
>>  * tree-vect-stmts.cc (vectorizable_load): Move the handlings on
>>  VMAT_GATHER_SCATTER in the final loop nest to its own loop,
>>  and update the final nest accordingly.
>> ---
>>  gcc/tree-vect-stmts.cc | 361 +
>>  1 file changed, 219 insertions(+), 142 deletions(-)
> 
> ...that seems like quite a lot of +s.  Is there nothing we can do to
> avoid the cut-&-paste?

Thanks for the comments!  I'm not sure I understand your question: if we
want to move out the handling of VMAT_GATHER_SCATTER, the new +s seem
inevitable.  Is your concern mainly about git blame history?

BR,
Kewen

> 
> Richard
> 
>>
>> diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
>> index c361e16cb7b..5e514eca19b 100644
>> --- a/gcc/tree-vect-stmts.cc
>> +++ b/gcc/tree-vect-stmts.cc
>> @@ -10455,6 +10455,218 @@ vectorizable_load (vec_info *vinfo,
>>return true;
>>  }
>>
>> +  if (memory_access_type == VMAT_GATHER_SCATTER)
>> +{
>> +  gcc_assert (alignment_support_scheme == dr_aligned
>> +  || alignment_support_scheme == dr_unaligned_supported);
>> +  gcc_assert (!grouped_load && !slp_perm);
>> +
>> +  unsigned int inside_cost = 0, prologue_cost = 0;
>> +  for (j = 0; j < ncopies; j++)
>> +{
>> +  /* 1. Create the vector or array pointer update chain.  */
>> +  if (j == 0 && !costing_p)
>> +{
>> +  if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
>> +vect_get_gather_scatter_ops (loop_vinfo, loop, stmt_info,
>> + slp_node, &gs_info, &dataref_ptr,
>> + &vec_offsets);
>> +  else
>> +dataref_ptr
>> +  = vect_create_data_ref_ptr (vinfo, first_stmt_info, aggr_type,
>> +  at_loop, offset, &dummy, gsi,
>> +  &ptr_incr, false, bump);
>> +}
>> +  else if (!costing_p)
>> +{
>> +  gcc_assert (!LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo));
>> +  if (!STMT_VINFO_GATHER_SCATTER_P (stmt_info))
>> +dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr,
>> +   gsi, stmt_info, bump);
>> +}
>> +
>> +  if (mask && !costing_p)
>> +vec_mask = vec_masks[j];
>> +
>> +  gimple *new_stmt = NULL;
>> +  for (i = 0; i < vec_num; i++)
>> +{
>> +  tree final_mask = NULL_TREE;
>> +  tree final_len = NULL_TREE;
>> +  tree bias = NULL_TREE;
>> +  if (!costing_p)
>> +{
>> +  if (loop_masks)
>> +final_mask
>> +  = vect_get_loop_mask (loop_vinfo, gsi, loop_masks,
>> +vec_num * ncopies, vectype,
>> +vec_num * j + i);
>> +  if (vec_mask)
>> +final_mask = prepare_vec_mask (loop_vinfo, mask_vectype,
>> +   final_mask, vec_mask, gsi);
>> +
>> +  if (i > 0 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
>> +dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr,
>> +   gsi, stmt_info, bump);
>> +}
>> +
>> +  /* 2. Create the vector-load in the loop.  */
>> +  unsigned HOST_WIDE_INT align;
>> +  if (gs_info.ifn != IFN_LAST)
>> +{
>> +  if (costing_p)
>> +{
>> +  unsigned int cnunits = vect_nunits_for_cost (vectype);
>> +  inside_cost
>> += record_stmt_cost (cost_vec, cnunits, scalar_load,
>> +stmt_info, 0, vect_body);
>> +  continue;
>> +}
>> +  if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
>> +vec_offset = vec_offsets[vec_num * j + i];
>> +  tree zero = build_zero_cst (vectype);
>> +  tree scale = size_int (gs_info.scale);
>> +
>> +  if (gs_info.ifn == IFN_MASK_LEN_GATHER_LOAD)
>> +

Re: [RFC] GCC Security policy

2023-08-14 Thread Siddhesh Poyarekar

Hi,

Here's the updated draft of the top part of the security policy with all 
of the recommendations incorporated.


Thanks,
Sid


What is a GCC security bug?
===========================

A security bug is one that threatens the security of a system or
network, or might compromise the security of data stored on it.
In the context of GCC there are multiple ways in which this might
happen and they're detailed below.

Compiler drivers, programs, libgccjit and support libraries
-----------------------------------------------------------

The compiler driver processes source code, invokes other programs
such as the assembler and linker and generates the output result,
which may be assembly code or machine code.  It is necessary that
all source code inputs to the compiler are trusted, since it is
impossible for the driver to validate input source code beyond
conformance to a programming language standard.

The GCC JIT implementation, libgccjit, is intended to be plugged
into applications to translate input source code in the application
context.  Limitations that apply to the compiler
driver, apply here too in terms of sanitizing inputs, so it is
recommended that inputs are either sanitized by an external program
to allow only trusted, safe execution in the context of the
application or the JIT execution context is appropriately sandboxed
to contain the effects of any bugs in the JIT or its generated code
to the sandboxed environment.

Support libraries such as libiberty, libcc1, libvtv and libcpp have
been developed separately to share code with other tools such as
binutils and gdb.  These libraries again have similar challenges to
compiler drivers.  While they are expected to be robust against
arbitrary input, they should only be used with trusted inputs.

Libraries such as zlib that are bundled with GCC to build it will be
treated the same as the compiler drivers and programs as far as
security coverage is concerned.  However if you find an issue in
these libraries independent of their use in GCC, you should reach
out to their upstream projects to report them.

As a result, the only case for a potential security issue in all
these cases is when it ends up generating vulnerable output for
valid input source code.

As a result, the only case for a potential security issue in the
compiler is when it generates vulnerable application code for
trusted input source code that is conforming to the relevant
programming standard or extensions documented as supported by GCC
and the algorithm expressed in the source code does not have the
vulnerability.  The output application code could be considered
vulnerable if it produces an actual vulnerability in the target
application, specifically in the following cases:

- The application dereferences an invalid memory location despite
  the application sources being valid.
- The application reads from or writes to a valid but incorrect
  memory location, resulting in an information integrity issue or an
  information leak.
- The application ends up running in an infinite loop or with
  severe degradation in performance despite the input sources having
  no such issue, resulting in a Denial of Service.  Note that
  correct but non-performant code is not a security issue candidate,
  this only applies to incorrect code that may result in performance
  degradation severe enough to amount to a denial of service.
- The application crashes due to the generated incorrect code,
  resulting in a Denial of Service.

Language runtime libraries
--------------------------

GCC also builds and distributes libraries that are intended to be
used widely to implement runtime support for various programming
languages.  These include the following:

* libada
* libatomic
* libbacktrace
* libcc1
* libcody
* libcpp
* libdecnumber
* libffi
* libgcc
* libgfortran
* libgm2
* libgo
* libgomp
* libiberty
* libitm
* libobjc
* libphobos
* libquadmath
* libsanitizer
* libssp
* libstdc++

These libraries are intended to be used in arbitrary contexts and as
a result, bugs in these libraries may be evaluated for security
impact.  However, some of these libraries, e.g. libgo, libphobos,
etc., are not maintained in the GCC project, due to which the GCC
project may not be the correct point of contact for them.  You are
encouraged to look at README files within those library directories
to locate the canonical security contact point for those projects
and include them in the report.  Once the issue is fixed in the
upstream project, the fix will be synced into GCC in a future
release.

Most security vulnerabilities in these runtime libraries arise when
an application us

Re: [PATCH v1] RISC-V: Support RVV VFREC7 rounding mode intrinsic API

2023-08-14 Thread 钟居哲

I'll defer the review of this patch to Kito, since I am not sure whether vfrec7
needs a rounding mode.



juzhe.zh...@rivai.ai
 
From: pan2.li
Date: 2023-08-14 20:49
To: gcc-patches
CC: juzhe.zhong; pan2.li; yanzhang.wang; kito.cheng
Subject: [PATCH v1] RISC-V: Support RVV VFREC7 rounding mode intrinsic API
From: Pan Li 
 
This patch adds support for the rounding mode API for VFREC7,
as shown in the samples below.
 
* __riscv_vfrec7_v_f32m1_rm
* __riscv_vfrec7_v_f32m1_rm_m
 
Signed-off-by: Pan Li 
 
gcc/ChangeLog:
 
* config/riscv/riscv-vector-builtins-bases.cc
(class vfrec7_frm): New class for frm.
(vfrec7_frm_obj): New declaration.
(BASE): Ditto.
* config/riscv/riscv-vector-builtins-bases.h: Ditto.
* config/riscv/riscv-vector-builtins-functions.def
(vfrec7_frm): New intrinsic function definition.
* config/riscv/vector-iterators.md
(VFMISC): Remove VFREC7.
(misc_op): Ditto.
(float_insn_type): Ditto.
(VFMISC_FRM): New int iterator.
(misc_frm_op): New op for frm.
(float_frm_insn_type): New type for frm.
* config/riscv/vector.md (@pred_<misc_frm_op><mode>):
New pattern for misc frm.
 
gcc/testsuite/ChangeLog:
 
* gcc.target/riscv/rvv/base/float-point-rec7.c: New test.
---
.../riscv/riscv-vector-builtins-bases.cc  | 17 ++
.../riscv/riscv-vector-builtins-bases.h   |  1 +
.../riscv/riscv-vector-builtins-functions.def |  2 ++
gcc/config/riscv/vector-iterators.md  | 12 +--
gcc/config/riscv/vector.md| 23 ++
.../riscv/rvv/base/float-point-rec7.c | 31 +++
6 files changed, 83 insertions(+), 3 deletions(-)
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/base/float-point-rec7.c
 
diff --git a/gcc/config/riscv/riscv-vector-builtins-bases.cc 
b/gcc/config/riscv/riscv-vector-builtins-bases.cc
index 2074dac0f16..249ac4e68cd 100644
--- a/gcc/config/riscv/riscv-vector-builtins-bases.cc
+++ b/gcc/config/riscv/riscv-vector-builtins-bases.cc
@@ -646,6 +646,21 @@ public:
   }
};
+/* Implements below instructions for frm
+   - vfrec7
+*/
+template<int UNSPEC>
+class vfrec7_frm : public function_base
+{
+public:
+  bool has_rounding_mode_operand_p () const override { return true; }
+
+  rtx expand (function_expander &e) const override
+  {
+return e.use_exact_insn (code_for_pred (UNSPEC, e.vector_mode ()));
+  }
+};
+
/* Implements vrsub.  */
class vrsub : public function_base
{
@@ -2433,6 +2448,7 @@ static CONSTEXPR const unop vfsqrt_obj;
static CONSTEXPR const unop_frm vfsqrt_frm_obj;
static CONSTEXPR const float_misc vfrsqrt7_obj;
static CONSTEXPR const float_misc vfrec7_obj;
+static CONSTEXPR const vfrec7_frm vfrec7_frm_obj;
static CONSTEXPR const binop vfmin_obj;
static CONSTEXPR const binop vfmax_obj;
static CONSTEXPR const float_misc vfsgnj_obj;
@@ -2681,6 +2697,7 @@ BASE (vfsqrt)
BASE (vfsqrt_frm)
BASE (vfrsqrt7)
BASE (vfrec7)
+BASE (vfrec7_frm)
BASE (vfmin)
BASE (vfmax)
BASE (vfsgnj)
diff --git a/gcc/config/riscv/riscv-vector-builtins-bases.h 
b/gcc/config/riscv/riscv-vector-builtins-bases.h
index 5c91381bd4c..2a9381eec5e 100644
--- a/gcc/config/riscv/riscv-vector-builtins-bases.h
+++ b/gcc/config/riscv/riscv-vector-builtins-bases.h
@@ -187,6 +187,7 @@ extern const function_base *const vfsqrt;
extern const function_base *const vfsqrt_frm;
extern const function_base *const vfrsqrt7;
extern const function_base *const vfrec7;
+extern const function_base *const vfrec7_frm;
extern const function_base *const vfmin;
extern const function_base *const vfmax;
extern const function_base *const vfsgnj;
diff --git a/gcc/config/riscv/riscv-vector-builtins-functions.def 
b/gcc/config/riscv/riscv-vector-builtins-functions.def
index a821aca6a4b..34def6bb82f 100644
--- a/gcc/config/riscv/riscv-vector-builtins-functions.def
+++ b/gcc/config/riscv/riscv-vector-builtins-functions.def
@@ -396,6 +396,8 @@ DEF_RVV_FUNCTION (vfrsqrt7, alu, full_preds, f_v_ops)
// 13.10. Vector Floating-Point Reciprocal Estimate Instruction
DEF_RVV_FUNCTION (vfrec7, alu, full_preds, f_v_ops)
+DEF_RVV_FUNCTION (vfrec7_frm, alu_frm, full_preds, f_v_ops)
+
// 13.11. Vector Floating-Point MIN/MAX Instructions
DEF_RVV_FUNCTION (vfmin, alu, full_preds, f_vvv_ops)
DEF_RVV_FUNCTION (vfmin, alu, full_preds, f_vvf_ops)
diff --git a/gcc/config/riscv/vector-iterators.md 
b/gcc/config/riscv/vector-iterators.md
index 30808ceb241..9dd611e254b 100644
--- a/gcc/config/riscv/vector-iterators.md
+++ b/gcc/config/riscv/vector-iterators.md
@@ -1867,7 +1867,9 @@ (define_int_iterator VSAT_SHIFT_OP [UNSPEC_VSSRL 
UNSPEC_VSSRA])
(define_int_iterator VMISC [UNSPEC_VMSBF UNSPEC_VMSIF UNSPEC_VMSOF])
-(define_int_iterator VFMISC [UNSPEC_VFRSQRT7 UNSPEC_VFREC7])
+(define_int_iterator VFMISC [UNSPEC_VFRSQRT7])
+
+(define_int_iterator VFMISC_FRM [UNSPEC_VFREC7])
(define_int_iterator VFCVTS [UNSPEC_VFCVT UNSPEC_UNSIGNED_VFCVT])
@@ -1890,9 +1892,13 @@ (define_int_attr sat_insn_type [(UNSPEC_VAADDU "vaalu") 
(UNSPEC_VAADD "vaalu")
(UNSPEC_VNCLIPU "vnclip")])
(define_int_attr misc_op [(UNSPEC_VMSBF "sbf") (UNSPEC_VMSIF "sif") 
(UNSPEC_VMSOF "sof")

Re: Re: [PATCH V4] VECT: Support loop len control on EXTRACT_LAST vectorization

2023-08-14 Thread 钟居哲

Thanks Kewen.

But I saw there are 2 more file includes:

+#include "memmodel.h"
+#include "optabs.h"

Not sure whether Richard and Richi are OK with that change?

Thanks.



juzhe.zh...@rivai.ai
 
From: Kewen.Lin
Date: 2023-08-14 20:45
To: juzhe.zh...@rivai.ai
CC: Robin Dapp; richard.sandiford; rguenther; GCC Patches
Subject: Re: [PATCH V4] VECT: Support loop len control on EXTRACT_LAST 
vectorization
Hi Juzhe,
 
on 2023/8/14 20:08, juzhe.zh...@rivai.ai wrote:
> Hi, Kewin.
> 
> Could you test 'can_vec_extract_var_idx_p' and send V5 patch when you pass 
> the testing?
 
The below diff was bootstrapped and regress-tested on Power10 LE.  Compared to
the previous v4, the only changes should be the proposed
can_vec_extract_var_idx_p and its required new includes as below:
 
+#include "memmodel.h"
+#include "optabs.h"
Could you double-check?
 
Since I just tested it on Power10 and you have full ownership of the patch,
I'd leave the v5 posting to you.  Thanks!
 
BR,
Kewen
-
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index bc3063c3615..5ae9f69c7eb 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -32,6 +32,8 @@ along with GCC; see the file COPYING3.  If not see
#include "tree-pass.h"
#include "ssa.h"
#include "optabs-tree.h"
+#include "memmodel.h"
+#include "optabs.h"
#include "diagnostic-core.h"
#include "fold-const.h"
#include "stor-layout.h"
@@ -10300,17 +10302,7 @@ vectorizable_live_operation (vec_info *vinfo, 
stmt_vec_info stmt_info,
   /* No transformation required.  */
   if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
{
-   if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
-OPTIMIZE_FOR_SPEED))
- {
-   if (dump_enabled_p ())
- dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
- "can't operate on partial vectors "
- "because the target doesn't support extract "
- "last reduction.\n");
-   LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
- }
-   else if (slp_node)
+   if (slp_node)
{
  if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
@@ -10330,9 +10322,26 @@ vectorizable_live_operation (vec_info *vinfo, 
stmt_vec_info stmt_info,
  else
{
  gcc_assert (ncopies == 1 && !slp_node);
-   vect_record_loop_mask (loop_vinfo,
-  &LOOP_VINFO_MASKS (loop_vinfo),
-  1, vectype, NULL);
+   if (direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
+   OPTIMIZE_FOR_SPEED))
+ vect_record_loop_mask (loop_vinfo,
+&LOOP_VINFO_MASKS (loop_vinfo),
+1, vectype, NULL);
+   else if (can_vec_extract_var_idx_p (
+ TYPE_MODE (vectype), TYPE_MODE (TREE_TYPE (vectype
+ vect_record_loop_len (loop_vinfo,
+   &LOOP_VINFO_LENS (loop_vinfo),
+   1, vectype, 1);
+   else
+ {
+   if (dump_enabled_p ())
+ dump_printf_loc (
+   MSG_MISSED_OPTIMIZATION, vect_location,
+   "can't operate on partial vectors "
+   "because the target doesn't support extract "
+   "last reduction.\n");
+   LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
+ }
}
}
   /* ???  Enable for loop costing as well.  */
@@ -10358,7 +10367,9 @@ vectorizable_live_operation (vec_info *vinfo, 
stmt_vec_info stmt_info,
   gimple *vec_stmt;
   if (slp_node)
 {
-  gcc_assert (!loop_vinfo || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
+  gcc_assert (!loop_vinfo
+   || (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
+   && !LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo)));
 
   /* Get the correct slp vectorized stmt.  */
   vec_lhs = SLP_TREE_VEC_DEFS (slp_node)[vec_entry];
@@ -10402,7 +10413,42 @@ vectorizable_live_operation (vec_info *vinfo, 
stmt_vec_info stmt_info,
 
   gimple_seq stmts = NULL;
   tree new_tree;
-  if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+  if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
+ {
+   /* Emit:
+
+SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>
+
+  where VEC_LHS is the vectorized live-out result and MASK is
+  the loop mask for the final iteration.  */
+   gcc_assert (ncopies == 1 && !slp_node);
+   gimple_seq tem = NULL;
+   gimple_stmt_iterator gsi = gsi_last (tem);
+   tree len
+ = vect_get_loop_len (loop_vinfo, &gsi,
+ &LOOP_VINFO_LENS (loop_vinfo),
+ 1, vectype, 0, 0);
+
+   /* BIAS - 1.  */
+   signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
+   tree bias_minus_one
+ = int_const_binop (MINUS_EXPR,
+build_int_cst (TREE_TYPE (len), biasval),
+build_one_cst (TREE_TYPE (len)));
+
+   /* LAST_INDEX = LEN + (BIAS - 1).  */
+   tree last_index = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (len),
+   len, bias_minus_one);
+
+   /* SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>.  */
+   tree scalar_res
+ = gimple_build (&stmts, CFN_VEC_EXTRACT, TREE_TYPE (vectype),
+ vec_lhs_phi, last_index);
+
+   /* Convert the extracted vector element to the scalar type.  */
+   new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
+ }
+

[PATCH] tree-optimization/110991 - unroll size estimate after vectorization

The following testcase shows that we are bad at identifying inductions
that will be optimized away after vectorizing them because SCEV doesn't
handle vectorized defs.  The following rolls a simpler identification
of SSA cycles covering a PHI and an assignment with a binary operator
with a constant second operand.

Bootstrapped and tested on x86_64-unknown-linux-gnu.

Note, I also have a more general approach (will reply to this mail
with an RFC).

Any comments on this particular change?

PR tree-optimization/110991
* tree-ssa-loop-ivcanon.cc (constant_after_peeling): Handle
VIEW_CONVERT_EXPR <op>, handle more simple IV-like SSA cycles
that will end up constant.

* gcc.dg/tree-ssa/cunroll-16.c: New testcase.
---
 gcc/testsuite/gcc.dg/tree-ssa/cunroll-16.c | 17 
 gcc/tree-ssa-loop-ivcanon.cc   | 46 +-
 2 files changed, 62 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/cunroll-16.c

diff --git a/gcc/testsuite/gcc.dg/tree-ssa/cunroll-16.c 
b/gcc/testsuite/gcc.dg/tree-ssa/cunroll-16.c
new file mode 100644
index 000..9bb66ff8299
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/cunroll-16.c
@@ -0,0 +1,17 @@
+/* PR/110991 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-cunroll-details -fdump-tree-optimized" } */
+
+static unsigned char a;
+static signed char b;
+void foo(void);
+int main() {
+  a = 25;
+  for (; a > 13; --a)
+b = a > 127 ?: a << 3;
+  if (!b)
+foo();
+}
+
+/* { dg-final { scan-tree-dump "optimized: loop with \[0-9\]\+ iterations 
completely unrolled" "cunroll" } } */
+/* { dg-final { scan-tree-dump-not "foo" "optimized" } } */
diff --git a/gcc/tree-ssa-loop-ivcanon.cc b/gcc/tree-ssa-loop-ivcanon.cc
index a895e8e65be..99e50ee2efe 100644
--- a/gcc/tree-ssa-loop-ivcanon.cc
+++ b/gcc/tree-ssa-loop-ivcanon.cc
@@ -166,6 +166,11 @@ constant_after_peeling (tree op, gimple *stmt, class loop 
*loop)
   if (CONSTANT_CLASS_P (op))
 return true;
 
+  /* Get at the actual SSA operand.  */
+  if (handled_component_p (op)
+  && TREE_CODE (TREE_OPERAND (op, 0)) == SSA_NAME)
+op = TREE_OPERAND (op, 0);
+
   /* We can still fold accesses to constant arrays when index is known.  */
   if (TREE_CODE (op) != SSA_NAME)
 {
@@ -198,7 +203,46 @@ constant_after_peeling (tree op, gimple *stmt, class loop 
*loop)
   tree ev = analyze_scalar_evolution (loop, op);
   if (chrec_contains_undetermined (ev)
   || chrec_contains_symbols (ev))
-return false;
+{
+  if (ANY_INTEGRAL_TYPE_P (TREE_TYPE (op)))
+   {
+ gassign *ass = nullptr;
+ gphi *phi = nullptr;
+ if (is_a  (SSA_NAME_DEF_STMT (op)))
+   {
+ ass = as_a  (SSA_NAME_DEF_STMT (op));
+ if (TREE_CODE (gimple_assign_rhs1 (ass)) == SSA_NAME)
+   phi = dyn_cast 
+   (SSA_NAME_DEF_STMT (gimple_assign_rhs1  (ass)));
+   }
+ else if (is_a  (SSA_NAME_DEF_STMT (op)))
+   {
+ phi = as_a  (SSA_NAME_DEF_STMT (op));
+ if (gimple_bb (phi) == loop->header)
+   {
+ tree def = gimple_phi_arg_def_from_edge
+   (phi, loop_latch_edge (loop));
+ if (TREE_CODE (def) == SSA_NAME
+ && is_a  (SSA_NAME_DEF_STMT (def)))
+   ass = as_a  (SSA_NAME_DEF_STMT (def));
+   }
+   }
+ if (ass && phi)
+   {
+ tree rhs1 = gimple_assign_rhs1 (ass);
+ if (gimple_assign_rhs_class (ass) == GIMPLE_BINARY_RHS
+ && CONSTANT_CLASS_P (gimple_assign_rhs2 (ass))
+ && rhs1 == gimple_phi_result (phi)
+ && gimple_bb (phi) == loop->header
+ && (gimple_phi_arg_def_from_edge (phi, loop_latch_edge (loop))
+ == gimple_assign_lhs (ass))
+ && (CONSTANT_CLASS_P (gimple_phi_arg_def_from_edge
+(phi, loop_preheader_edge (loop)
+   return true;
+   }
+   }
+  return false;
+}
   return true;
 }
 
-- 
2.35.3

Re: [PATCH] tree-optimization/110991 - unroll size estimate after vectorization

On Mon, 14 Aug 2023, Richard Biener wrote:

> The following testcase shows that we are bad at identifying inductions
> that will be optimized away after vectorizing them because SCEV doesn't
> handle vectorized defs.  The following rolls a simpler identification
> of SSA cycles covering a PHI and an assignment with a binary operator
> with a constant second operand.
> 
> Bootstrapped and tested on x86_64-unknown-linux-gnu.
> 
> Note, I also have a more general approach (will reply to this mail
> with an RFC).

So the following is an RFC, it replaces constant_after_peeling
with verifying all SSA operands are constants and then folding
the stmt, recording constant outcomes for further stmts becoming
constants.

We now traverse the loop body twice - once with the optimistic
constant initial values of IVs and after the first traversal
we drop these if the backedge value turns out non-constant.

We then use the outcomes from the second traversal for the size
estimate.

Now, we could use the sizes of the first traversal somehow
if we recorded them separately.  Maybe as followup.

I've again chickened out from doing the transform-with-value-numbering
approach, stopping when we hit a stmt copy limit.  The reason is
of course it's only reasonably simple if there's no branching in the
copied body (for example if we can resolve all branches during
unrolling).  Maybe we should really try harder here ...

I'm currently re-testing this (I made it less optimistic) and having
to fix up some Fortran frontend -Warray-bounds diagnostics (meh) as
we now unroll something there.

Does this look better than trying to ad-hoc match the PHI "IV"s
that SCEV doesn't handle?

Thanks,
Richard.

>From 75bc2d108ebc23d513fa49664ffc6bcdb5559495 Mon Sep 17 00:00:00 2001
From: Richard Biener 
Date: Mon, 14 Aug 2023 12:02:41 +0200
Subject: [PATCH] test unroll
To: gcc-patches@gcc.gnu.org

---
 .../gcc.dg/fstack-protector-strong.c  |   4 +-
 gcc/tree-ssa-loop-ivcanon.cc  | 157 --
 2 files changed, 112 insertions(+), 49 deletions(-)

diff --git a/gcc/testsuite/gcc.dg/fstack-protector-strong.c 
b/gcc/testsuite/gcc.dg/fstack-protector-strong.c
index 94dc3508f1a..fafa1917449 100644
--- a/gcc/testsuite/gcc.dg/fstack-protector-strong.c
+++ b/gcc/testsuite/gcc.dg/fstack-protector-strong.c
@@ -28,7 +28,7 @@ foo1 ()
 struct ArrayStruct
 {
   int a;
-  int array[10];
+  int array[18];
 };
 
 struct AA
@@ -43,7 +43,7 @@ foo2 ()
 {
   struct AA aa;
   int i;
-  for (i = 0; i < 10; ++i)
+  for (i = 0; i < 18; ++i)
 {
   aa.as.array[i] = i * (i-1) + i / 2;
 }
diff --git a/gcc/tree-ssa-loop-ivcanon.cc b/gcc/tree-ssa-loop-ivcanon.cc
index 99e50ee2efe..51543e43cbc 100644
--- a/gcc/tree-ssa-loop-ivcanon.cc
+++ b/gcc/tree-ssa-loop-ivcanon.cc
@@ -158,6 +158,7 @@ struct loop_size
   int num_branches_on_hot_path;
 };
 
+#if 0
 /* Return true if OP in STMT will be constant after peeling LOOP.  */
 
 static bool
@@ -245,6 +246,7 @@ constant_after_peeling (tree op, gimple *stmt, class loop 
*loop)
 }
   return true;
 }
+#endif
 
 /* Computes an estimated number of insns in LOOP.
EXIT (if non-NULL) is an exite edge that will be eliminated in all but last
@@ -276,6 +278,31 @@ tree_estimate_loop_size (class loop *loop, edge exit, edge 
edge_to_cancel,
 
   if (dump_file && (dump_flags & TDF_DETAILS))
 fprintf (dump_file, "Estimating sizes for loop %i\n", loop->num);
+
+  static hash_map *vals;
+  vals = new hash_map;
+  edge pe = loop_preheader_edge (loop);
+  for (auto si = gsi_start_phis (loop->header);
+   !gsi_end_p (si); gsi_next (&si))
+{
+  if (virtual_operand_p (gimple_phi_result (*si)))
+   continue;
+  tree val = gimple_phi_arg_def_from_edge (*si, pe);
+  if (CONSTANT_CLASS_P (val))
+   {
+ vals->put (gimple_phi_result (*si), val);
+ tree ev = analyze_scalar_evolution (loop, gimple_phi_result (*si));
+ if (!chrec_contains_undetermined (ev)
+ && !chrec_contains_symbols (ev))
+   size->constant_iv = true;
+   }
+}
+
+  auto els_valueize = [] (tree op) -> tree
+{ if (tree *val = vals->get (op)) return *val; return op; };
+
+  auto process_loop = [&] () -> bool
+{
   for (i = 0; i < loop->num_nodes; i++)
 {
   if (edge_to_cancel && body[i] != edge_to_cancel->src
@@ -322,54 +349,47 @@ tree_estimate_loop_size (class loop *loop, edge exit, 
edge edge_to_cancel,
 "in last copy.\n");
  likely_eliminated_last = true;
}
- /* Sets of IV variables  */
- if (gimple_code (stmt) == GIMPLE_ASSIGN
- && constant_after_peeling (gimple_assign_lhs (stmt), stmt, 
loop))
+ /* Stores are not eliminated.  */
+ if (gimple_vdef (stmt))
+   continue;
+ /* Below we are using constant folding to decide whether
+we can elide a stmt.  While for the first iteration we
+

Re: Re: [PATCH V4] VECT: Support loop len control on EXTRACT_LAST vectorization

On Mon, 14 Aug 2023, ??? wrote:

> Thanks Kewen.
> 
> But I saw there is 2 more files include:
> 
> +#include "memmodel.h"
> +#include "optabs.h"
> 
> Not sure whether Richard and Richi ok with that change ?

Yes, please just apply some common sense.

> Thanks.
> 
> 
> 
> juzhe.zh...@rivai.ai
>  
> From: Kewen.Lin
> Date: 2023-08-14 20:45
> To: juzhe.zh...@rivai.ai
> CC: Robin Dapp; richard.sandiford; rguenther; GCC Patches
> Subject: Re: [PATCH V4] VECT: Support loop len control on EXTRACT_LAST 
> vectorization
> Hi Juzhe,
>  
> on 2023/8/14 20:08, juzhe.zh...@rivai.ai wrote:
> > Hi, Kewin.
> > 
> > Could you test 'can_vec_extract_var_idx_p' and send V5 patch when you pass 
> > the testing?
>  
> The below diff was bootstrapped and regress-tested on Power10 LE.  Comparing 
> to the
> previous v4, the only changes should be the proposed 
> can_vec_extract_var_idx_p and
> its required new includes as below:
>  
> +#include "memmodel.h"
> +#include "optabs.h"
> Could you have a double check?
>  
> Since I just tested it on Power10, you have the full ownership on the patch, 
> I'd leave
> the v5 posting to you.  Thanks!
>  
> BR,
> Kewen
> -
> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> index bc3063c3615..5ae9f69c7eb 100644
> --- a/gcc/tree-vect-loop.cc
> +++ b/gcc/tree-vect-loop.cc
> @@ -32,6 +32,8 @@ along with GCC; see the file COPYING3.  If not see
> #include "tree-pass.h"
> #include "ssa.h"
> #include "optabs-tree.h"
> +#include "memmodel.h"
> +#include "optabs.h"
> #include "diagnostic-core.h"
> #include "fold-const.h"
> #include "stor-layout.h"
> @@ -10300,17 +10302,7 @@ vectorizable_live_operation (vec_info *vinfo, 
> stmt_vec_info stmt_info,
>/* No transformation required.  */
>if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
> {
> -   if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
> -OPTIMIZE_FOR_SPEED))
> - {
> -   if (dump_enabled_p ())
> - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> - "can't operate on partial vectors "
> - "because the target doesn't support extract "
> - "last reduction.\n");
> -   LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
> - }
> -   else if (slp_node)
> +   if (slp_node)
> {
>   if (dump_enabled_p ())
> dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> @@ -10330,9 +10322,26 @@ vectorizable_live_operation (vec_info *vinfo, 
> stmt_vec_info stmt_info,
>   else
> {
>   gcc_assert (ncopies == 1 && !slp_node);
> -   vect_record_loop_mask (loop_vinfo,
> -  &LOOP_VINFO_MASKS (loop_vinfo),
> -  1, vectype, NULL);
> +   if (direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
> +   OPTIMIZE_FOR_SPEED))
> + vect_record_loop_mask (loop_vinfo,
> +&LOOP_VINFO_MASKS (loop_vinfo),
> +1, vectype, NULL);
> +   else if (can_vec_extract_var_idx_p (
> + TYPE_MODE (vectype), TYPE_MODE (TREE_TYPE (vectype
> + vect_record_loop_len (loop_vinfo,
> +   &LOOP_VINFO_LENS (loop_vinfo),
> +   1, vectype, 1);
> +   else
> + {
> +   if (dump_enabled_p ())
> + dump_printf_loc (
> +   MSG_MISSED_OPTIMIZATION, vect_location,
> +   "can't operate on partial vectors "
> +   "because the target doesn't support extract "
> +   "last reduction.\n");
> +   LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
> + }
> }
> }
>/* ???  Enable for loop costing as well.  */
> @@ -10358,7 +10367,9 @@ vectorizable_live_operation (vec_info *vinfo, 
> stmt_vec_info stmt_info,
>gimple *vec_stmt;
>if (slp_node)
>  {
> -  gcc_assert (!loop_vinfo || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
> +  gcc_assert (!loop_vinfo
> +   || (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
> +   && !LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo)));
>  
>/* Get the correct slp vectorized stmt.  */
>vec_lhs = SLP_TREE_VEC_DEFS (slp_node)[vec_entry];
> @@ -10402,7 +10413,42 @@ vectorizable_live_operation (vec_info *vinfo, 
> stmt_vec_info stmt_info,
>  
>gimple_seq stmts = NULL;
>tree new_tree;
> -  if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
> +  if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
> + {
> +   /* Emit:
> +
> +SCALAR_RES = VEC_EXTRACT 
> +
> +  where VEC_LHS is the vectorized live-out result and MASK is
> +  the loop mask for the final iteration.  */
> +   gcc_assert (ncopies == 1 && !slp_node);
> +   gimple_seq tem = NULL;
> +   gimple_stmt_iterator gsi = gsi_last (tem);
> +   tree len
> + = vect_get_loop_len (loop_vinfo, &gsi,
> + &LOOP_VINFO_LENS (loop_vinfo),
> + 1, vectype, 0, 0);
> +
> +   /* BIAS - 1.  */
> +   signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
> +   tree bias_minus_one
> + = int_const_binop (MINUS_EXPR,
> +build_int_cst (TREE_TYPE (len), biasval),
> +build_one_cst (TREE_TYPE (len)));
> +
> +   /* LAST_INDEX = LEN + (BIAS - 1).  */
> +   tree last_index

Re: [PATCH] vect: Move VMAT_LOAD_STORE_LANES handlings from final loop nest

On Mon, Aug 14, 2023 at 2:49 PM Kewen.Lin  wrote:
>
> Hi Richi,
>
> on 2023/8/14 20:04, Richard Biener wrote:
> > On Mon, Aug 14, 2023 at 10:54 AM Kewen.Lin  wrote:
> >>
> >> Hi,
> >>
> >> Following Richi's suggestion [1], this patch is to move the
> >> handlings on VMAT_LOAD_STORE_LANES in the final loop nest
> >> of function vectorizable_load to its own loop.  Basically
> >> it duplicates the final loop nest, clean up some useless
> >> set up code for the case of VMAT_LOAD_STORE_LANES, remove
> >> some unreachable code.  Also remove the corresponding
> >> handlings in the final loop nest.
> >>
> >> Bootstrapped and regtested on x86_64-redhat-linux,
> >> aarch64-linux-gnu and powerpc64{,le}-linux-gnu.
> >
> > OK (I guess the big diff is mostly because of re-indenting).
>
> Thanks!  Yes, there is some code in the original final loop nest like
>
> if (memory_access_type == VMAT_LOAD_STORE_LANES)
>   {
> ...
>   }
> else
>   {
> ...
>   }
>
> Then the else arm is fully re-indented.
>
> The other patch on VMAT_GATHER_SCATTER looks a bit better since
> it doesn't need re-indenting.

Yes, that's also because VMAT_LOAD_STORE_LANES isn't for SLP so
it even makes more sense to split that case out.

Richard.

> BR,
> Kewen
>
> >
> > Thanks,
> > Richard.
> >
> >> [1] https://gcc.gnu.org/pipermail/gcc-patches/2023-June/623329.html
> >>
> >> gcc/ChangeLog:
> >>
> >> * tree-vect-stmts.cc (vectorizable_load): Move the handlings on
> >> VMAT_LOAD_STORE_LANES in the final loop nest to its own loop,
> >> and update the final nest accordingly.
> >> ---
> >>  gcc/tree-vect-stmts.cc | 1275 
> >>  1 file changed, 634 insertions(+), 641 deletions(-)
> >>
> >> diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> >> index 4f2d088484c..c361e16cb7b 100644
> >> --- a/gcc/tree-vect-stmts.cc
> >> +++ b/gcc/tree-vect-stmts.cc
> >> @@ -10332,7 +10332,129 @@ vectorizable_load (vec_info *vinfo,
> >> vect_get_vec_defs_for_operand (vinfo, stmt_info, ncopies, mask,
> >>&vec_masks, mask_vectype);
> >>  }
> >> +
> >>tree vec_mask = NULL_TREE;
> >> +  if (memory_access_type == VMAT_LOAD_STORE_LANES)
> >> +{
> >> +  gcc_assert (alignment_support_scheme == dr_aligned
> >> + || alignment_support_scheme == dr_unaligned_supported);
> >> +  gcc_assert (grouped_load && !slp);
> >> +
> >> +  unsigned int inside_cost = 0, prologue_cost = 0;
> >> +  for (j = 0; j < ncopies; j++)
> >> +   {
> >> + if (costing_p)
> >> +   {
> >> + /* An IFN_LOAD_LANES will load all its vector results,
> >> +regardless of which ones we actually need.  Account
> >> +for the cost of unused results.  */
> >> + if (first_stmt_info == stmt_info)
> >> +   {
> >> + unsigned int gaps = DR_GROUP_SIZE (first_stmt_info);
> >> + stmt_vec_info next_stmt_info = first_stmt_info;
> >> + do
> >> +   {
> >> + gaps -= 1;
> >> + next_stmt_info = DR_GROUP_NEXT_ELEMENT 
> >> (next_stmt_info);
> >> +   }
> >> + while (next_stmt_info);
> >> + if (gaps)
> >> +   {
> >> + if (dump_enabled_p ())
> >> +   dump_printf_loc (MSG_NOTE, vect_location,
> >> +"vect_model_load_cost: %d "
> >> +"unused vectors.\n",
> >> +gaps);
> >> + vect_get_load_cost (vinfo, stmt_info, gaps,
> >> + alignment_support_scheme,
> >> + misalignment, false, 
> >> &inside_cost,
> >> + &prologue_cost, cost_vec, 
> >> cost_vec,
> >> + true);
> >> +   }
> >> +   }
> >> + vect_get_load_cost (vinfo, stmt_info, 1, 
> >> alignment_support_scheme,
> >> + misalignment, false, &inside_cost,
> >> + &prologue_cost, cost_vec, cost_vec, 
> >> true);
> >> + continue;
> >> +   }
> >> +
> >> + /* 1. Create the vector or array pointer update chain.  */
> >> + if (j == 0)
> >> +   dataref_ptr
> >> + = vect_create_data_ref_ptr (vinfo, first_stmt_info, 
> >> aggr_type,
> >> + at_loop, offset, &dummy, gsi,
> >> + &ptr_incr, false, bump);
> >> + else
> >> +   {
> >> + gcc_assert (!LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo));
> >> + dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr, 
> >> gsi,
> >> +

Re: [PATCH v1] RISC-V: Support RVV VFREC7 rounding mode intrinsic API

2023-08-14 Thread Kito Cheng via Gcc-patches

> +template

You don't need a template class here since it can only be UNSPEC_VFREC7.

> +class vfrec7_frm : public function_base
> +{
> +public:
> +  bool has_rounding_mode_operand_p () const override { return true; }
> +
> +  rtx expand (function_expander &e) const override
> +  {
> +return e.use_exact_insn (code_for_pred (UNSPEC, e.vector_mode ()));
> +  }
> +};
> +
> /* Implements vrsub.  */
> class vrsub : public function_base
> {
> @@ -2433,6 +2448,7 @@ static CONSTEXPR const unop vfsqrt_obj;
> static CONSTEXPR const unop_frm vfsqrt_frm_obj;
> static CONSTEXPR const float_misc vfrsqrt7_obj;
> static CONSTEXPR const float_misc vfrec7_obj;
> +static CONSTEXPR const vfrec7_frm vfrec7_frm_obj;

Then `static CONSTEXPR const vfrec7_frm vfrec7_frm_obj;` here

> static CONSTEXPR const binop vfmin_obj;
> static CONSTEXPR const binop vfmax_obj;
> static CONSTEXPR const float_misc vfsgnj_obj;

Re: [PATCH v2 0/3] RISC-V: Support ZC* extensions.

2023-08-14 Thread Kito Cheng via Gcc-patches

Pushed to the trunk, with slight updates like rename and update testcases :)


On Wed, Jun 7, 2023 at 10:28 PM Kito Cheng via Gcc-patches
 wrote:
>
> Thanks Jiawei, v2 patch set are LGTM, but I would like to defer this until
> binutils part has merged, I know you guys already implement that for a
> while, so I think it’s almost there :)
>
> Jiawei 於 2023年6月7日 週三，20:57寫道：
>
> > RISC-V Code Size Reduction(ZC*) extensions is a group of extensions
> > which define subsets of the existing C extension (Zca, Zcd, Zcf) and new
> > extensions(Zcb, Zcmp, Zcmt) which only contain 16-bit encodings.[1]
> >
> > The implementation of the RISC-V Code Size Reduction extension in GCC is
> > an important step towards making the RISC-V architecture more efficient.
> >
> > The cooperation with OpenHW group has played a crucial role in this effort,
> > with facilitating the implementation, testing and validation. Currently
> > works can also find in OpenHW group's github repo.[2]
> >
> > Thanks to Tariq Kurd, Ibrahim Abu Kharmeh for help with explain the
> > specification, and Jeremy Bennett's patient guidance throughout the whole
> > development process.a
> >
> > V2 changes:
> > Fix Kito's comments in first version, Eswin assisted in optimizing the
> > implementation of Zcmp extension:
> > https://gcc.gnu.org/pipermail/gcc-patches/2023-May/617440.html
> > https://gcc.gnu.org/pipermail/gcc-patches/2023-May/617442.html
> >
> > https://gcc.gnu.org/pipermail/gcc-patches/2023-June/620869.html
> >
> >
> > [1] github.com/riscv/riscv-code-size-reduction/tree/main/Zc-specification
> >
> > [2] github.com/openhwgroup/corev-gcc
> >
> > Co-Authored by: Charlie Keaney 
> > Co-Authored by: Mary Bennett 
> > Co-Authored by: Nandni Jamnadas 
> > Co-Authored by: Sinan Lin 
> > Co-Authored by: Simon Cook 
> > Co-Authored by: Shihua Liao 
> > Co-Authored by: Yulong Shi 
> >
> >   RISC-V: Minimal support for ZC extensions.
> >   RISC-V: Enable compressible features when use ZC* extensions.
> >   RISC-V: Add ZC* test for march args being passed.
> >
> >
> > Jiawei (3):
> >   RISC-V: Minimal support for ZC* extensions.
> >   RISC-V: Enable compressible features when use ZC* extensions.
> >   RISC-V: Add ZC* test for failed march args being passed.
> >
> >  gcc/common/config/riscv/riscv-common.cc   | 38 +++
> >  gcc/config/riscv/riscv-c.cc   |  2 +-
> >  gcc/config/riscv/riscv-opts.h | 16 ++
> >  gcc/config/riscv/riscv-shorten-memrefs.cc |  3 +-
> >  gcc/config/riscv/riscv.cc | 11 ---
> >  gcc/config/riscv/riscv.h  |  2 +-
> >  gcc/config/riscv/riscv.opt|  3 ++
> >  gcc/testsuite/gcc.target/riscv/arch-22.c  |  5 +++
> >  gcc/testsuite/gcc.target/riscv/arch-23.c  |  5 +++
> >  9 files changed, 78 insertions(+), 7 deletions(-)
> >  create mode 100644 gcc/testsuite/gcc.target/riscv/arch-22.c
> >  create mode 100644 gcc/testsuite/gcc.target/riscv/arch-23.c
> >
> > --
> > 2.25.1
> >
> >

[PATCH][GCC] aarch64: Add support for Cortex-A720 CPU

2023-08-14 Thread Richard Ball via Gcc-patches

This patch adds support for the Cortex-A720 CPU to GCC.

No regressions on aarch64-none-elf.

Ok for master?

gcc/ChangeLog:

* config/aarch64/aarch64-cores.def (AARCH64_CORE): Add Cortex-
A720 CPU.
* config/aarch64/aarch64-tune.md: Regenerate.
* doc/invoke.texi: Document Cortex-A720 CPU.
diff --git a/gcc/config/aarch64/aarch64-cores.def 
b/gcc/config/aarch64/aarch64-cores.def
index 
dbac497ef3aab410eb81db185b2e9532186888bb..5369dd3dd0fe695a371261547c76f034c29b9bcd
 100644
--- a/gcc/config/aarch64/aarch64-cores.def
+++ b/gcc/config/aarch64/aarch64-cores.def
@@ -176,6 +176,8 @@ AARCH64_CORE("cortex-a710",  cortexa710, cortexa57, V9A,  
(SVE2_BITPERM, MEMTAG,
 
 AARCH64_CORE("cortex-a715",  cortexa715, cortexa57, V9A,  (SVE2_BITPERM, 
MEMTAG, I8MM, BF16), neoversen2, 0x41, 0xd4d, -1)
 
+AARCH64_CORE("cortex-a720",  cortexa720, cortexa57, V9_2A,  (SVE2_BITPERM, 
MEMTAG), neoversen2, 0x41, 0xd81, -1)
+
 AARCH64_CORE("cortex-x2",  cortexx2, cortexa57, V9A,  (SVE2_BITPERM, MEMTAG, 
I8MM, BF16), neoversen2, 0x41, 0xd48, -1)
 
 AARCH64_CORE("cortex-x3",  cortexx3, cortexa57, V9A,  (SVE2_BITPERM, MEMTAG, 
I8MM, BF16), neoversen2, 0x41, 0xd4e, -1)
diff --git a/gcc/config/aarch64/aarch64-tune.md 
b/gcc/config/aarch64/aarch64-tune.md
index 
2170980dddb0d5d410a49631ad26ff2e346b39dd..12d610f0f6580096eed9cf3de8ad3239efde5e4b
 100644
--- a/gcc/config/aarch64/aarch64-tune.md
+++ b/gcc/config/aarch64/aarch64-tune.md
@@ -1,5 +1,5 @@
 ;; -*- buffer-read-only: t -*-
 ;; Generated automatically by gentune.sh from aarch64-cores.def
 (define_attr "tune"
-   
"cortexa34,cortexa35,cortexa53,cortexa57,cortexa72,cortexa73,thunderx,thunderxt88p1,thunderxt88,octeontx,octeontxt81,octeontxt83,thunderxt81,thunderxt83,ampere1,ampere1a,emag,xgene1,falkor,qdf24xx,exynosm1,phecda,thunderx2t99p1,vulcan,thunderx2t99,cortexa55,cortexa75,cortexa76,cortexa76ae,cortexa77,cortexa78,cortexa78ae,cortexa78c,cortexa65,cortexa65ae,cortexx1,cortexx1c,ares,neoversen1,neoversee1,octeontx2,octeontx2t98,octeontx2t96,octeontx2t93,octeontx2f95,octeontx2f95n,octeontx2f95mm,a64fx,tsv110,thunderx3t110,zeus,neoversev1,neoverse512tvb,saphira,cortexa57cortexa53,cortexa72cortexa53,cortexa73cortexa35,cortexa73cortexa53,cortexa75cortexa55,cortexa76cortexa55,cortexr82,cortexa510,cortexa520,cortexa710,cortexa715,cortexx2,cortexx3,neoversen2,demeter,neoversev2"
+   
"cortexa34,cortexa35,cortexa53,cortexa57,cortexa72,cortexa73,thunderx,thunderxt88p1,thunderxt88,octeontx,octeontxt81,octeontxt83,thunderxt81,thunderxt83,ampere1,ampere1a,emag,xgene1,falkor,qdf24xx,exynosm1,phecda,thunderx2t99p1,vulcan,thunderx2t99,cortexa55,cortexa75,cortexa76,cortexa76ae,cortexa77,cortexa78,cortexa78ae,cortexa78c,cortexa65,cortexa65ae,cortexx1,cortexx1c,ares,neoversen1,neoversee1,octeontx2,octeontx2t98,octeontx2t96,octeontx2t93,octeontx2f95,octeontx2f95n,octeontx2f95mm,a64fx,tsv110,thunderx3t110,zeus,neoversev1,neoverse512tvb,saphira,cortexa57cortexa53,cortexa72cortexa53,cortexa73cortexa35,cortexa73cortexa53,cortexa75cortexa55,cortexa76cortexa55,cortexr82,cortexa510,cortexa520,cortexa710,cortexa715,cortexa720,cortexx2,cortexx3,neoversen2,demeter,neoversev2"
(const (symbol_ref "((enum attr_tune) aarch64_tune)")))
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 
2c870d3c34b587ffc721b1f18f99ecd66d4217be..62537d9d09e25f864c27534b7ac2ec467ea24789
 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -20517,7 +20517,8 @@ performance of the code.  Permissible values for this 
option are:
 @samp{cortex-a75.cortex-a55}, @samp{cortex-a76.cortex-a55},
 @samp{cortex-r82}, @samp{cortex-x1}, @samp{cortex-x1c}, @samp{cortex-x2},
 @samp{cortex-x3}, @samp{cortex-a510}, @samp{cortex-a520}, @samp{cortex-a710},
-@samp{cortex-a715}, @samp{ampere1}, @samp{ampere1a}, and @samp{native}.
+@samp{cortex-a715}, @samp{cortex-a720}, @samp{ampere1}, @samp{ampere1a},
+and @samp{native}.
 
 The values @samp{cortex-a57.cortex-a53}, @samp{cortex-a72.cortex-a53},
 @samp{cortex-a73.cortex-a35}, @samp{cortex-a73.cortex-a53},

Re: [PATCH] vect: Move VMAT_GATHER_SCATTER handlings from final loop nest

2023-08-14 Thread Richard Sandiford via Gcc-patches

"Kewen.Lin"  writes:
> Hi Richard,
>
> on 2023/8/14 20:20, Richard Sandiford wrote:
>> Thanks for the clean-ups.  But...
>> 
>> "Kewen.Lin"  writes:
>>> Hi,
>>>
>>> Following Richi's suggestion [1], this patch is to move the
>>> handlings on VMAT_GATHER_SCATTER in the final loop nest
>>> of function vectorizable_load to its own loop.  Basically
>>> it duplicates the final loop nest, clean up some useless
>>> set up code for the case of VMAT_GATHER_SCATTER, remove some
>>> unreachable code.  Also remove the corresponding handlings
>>> in the final loop nest.
>>>
>>> Bootstrapped and regtested on x86_64-redhat-linux,
>>> aarch64-linux-gnu and powerpc64{,le}-linux-gnu.
>>>
>>> [1] https://gcc.gnu.org/pipermail/gcc-patches/2023-June/623329.html
>>>
>>> Is it ok for trunk?
>>>
>>> BR,
>>> Kewen
>>> -
>>>
>>> gcc/ChangeLog:
>>>
>>> * tree-vect-stmts.cc (vectorizable_load): Move the handlings on
>>> VMAT_GATHER_SCATTER in the final loop nest to its own loop,
>>> and update the final nest accordingly.
>>> ---
>>>  gcc/tree-vect-stmts.cc | 361 +
>>>  1 file changed, 219 insertions(+), 142 deletions(-)
>> 
>> ...that seems like quite a lot of +s.  Is there nothing we can do to
>> avoid the cut-&-paste?
>
> Thanks for the comments!  I'm not sure if I get your question, if we
> want to move out the handlings of VMAT_GATHER_SCATTER, the new +s seem
> inevitable?  Your concern is mainly about git blame history?

No, it was more that 219-142=77, so it seems like a lot of lines
are being duplicated rather than simply being moved.  (Unlike for
VMAT_LOAD_STORE_LANES, which was even a slight LOC saving, and so
was a clear improvement.)

So I was just wondering if there was any obvious factoring-out that
could be done to reduce the duplication.

Thanks,
Richard

Re: [PATCH] x86: Update model values for Raptorlake.

2023-08-14 Thread Jonathan Wakely via Gcc-patches

On 14/08/23 04:37 +, Pan Li via Gcc-patches wrote:

Committed as obvious, and backported to GCC13.


Did you try building it on gcc-13?

case 0x97:
case 0x9a:
case 0xbf:
  /* Alder Lake.  */
case 0xb7:
case 0xba:
case 0xbf:
  /* Raptor Lake.  */


This fails:

In file included from /home/test/src/gcc-13/gcc/config/i386/driver-i386.cc:31:
/home/test/src/gcc-13/gcc/common/config/i386/cpuinfo.h: In function ‘const 
char* get_intel_cpu(__processor_model*, __processor_model2*, unsigned int*)’:
/home/test/src/gcc-13/gcc/common/config/i386/cpuinfo.h:543:5: error: duplicate 
case value
  543 | case 0xbf:
  | ^~~~
/home/test/src/gcc-13/gcc/common/config/i386/cpuinfo.h:539:5: note: previously 
used here
  539 | case 0xbf:
  | ^~~~

Please fix or revert.



Lili.


Update model values for Raptorlake according to SDM.

gcc/ChangeLog

* common/config/i386/cpuinfo.h (get_intel_cpu): Add model value 0xba
to Raptorlake.
---
gcc/common/config/i386/cpuinfo.h | 1 +
1 file changed, 1 insertion(+)

diff --git a/gcc/common/config/i386/cpuinfo.h b/gcc/common/config/i386/cpuinfo.h
index ae48bc17771..dd7f0f6abfd 100644
--- a/gcc/common/config/i386/cpuinfo.h
+++ b/gcc/common/config/i386/cpuinfo.h
@@ -537,6 +537,7 @@ get_intel_cpu (struct __processor_model *cpu_model,
case 0x9a:
  /* Alder Lake.  */
case 0xb7:
+case 0xba:
case 0xbf:
  /* Raptor Lake.  */
case 0xaa:

Re: [PATCH] x86: Update model values for Raptorlake.

2023-08-14 Thread Jonathan Wakely via Gcc-patches

On 14/08/23 15:19 +0100, Jonathan Wakely wrote:

On 14/08/23 04:37 +, Pan Li via Gcc-patches wrote:

Committed as obvious, and backported to GCC13.


Did you try building it on gcc-13?

   case 0x97:
   case 0x9a:
   case 0xbf:
 /* Alder Lake.  */
   case 0xb7:
   case 0xba:
   case 0xbf:
 /* Raptor Lake.  */


This fails:

In file included from /home/test/src/gcc-13/gcc/config/i386/driver-i386.cc:31:
/home/test/src/gcc-13/gcc/common/config/i386/cpuinfo.h: In function ‘const 
char* get_intel_cpu(__processor_model*, __processor_model2*, unsigned int*)’:
/home/test/src/gcc-13/gcc/common/config/i386/cpuinfo.h:543:5: error: duplicate 
case value
 543 | case 0xbf:
 | ^~~~
/home/test/src/gcc-13/gcc/common/config/i386/cpuinfo.h:539:5: note: previously 
used here
 539 | case 0xbf:
 | ^~~~

Please fix or revert.



The backported patch is not the same as the trunk one, it adds two new
cases not one. But one of them is a duplicate of one you already added
in January 2022, in 4bd5297f665fd3ba5691297c016809f3501e7fba

No matter how obvious a patch is, if it touches code (not just
comments or docs) please don't commit without even building it once.

Also, backports should typically say something in the git commit
message, e.g. using git gcc-backport (or git cherry-pick -x) will
automatically add:

(cherry picked from commit 003016a40844701c48851020df672b70f3446bdb)

to the commit message.






Lili.


Update model values for Raptorlake according to SDM.

gcc/ChangeLog

* common/config/i386/cpuinfo.h (get_intel_cpu): Add model value 0xba
to Raptorlake.
---
gcc/common/config/i386/cpuinfo.h | 1 +
1 file changed, 1 insertion(+)

diff --git a/gcc/common/config/i386/cpuinfo.h b/gcc/common/config/i386/cpuinfo.h
index ae48bc17771..dd7f0f6abfd 100644
--- a/gcc/common/config/i386/cpuinfo.h
+++ b/gcc/common/config/i386/cpuinfo.h
@@ -537,6 +537,7 @@ get_intel_cpu (struct __processor_model *cpu_model,
   case 0x9a:
 /* Alder Lake.  */
   case 0xb7:
+case 0xba:
   case 0xbf:
 /* Raptor Lake.  */
   case 0xaa:

[PATCH v2] RISC-V: Support RVV VFREC7 rounding mode intrinsic API

2023-08-14 Thread Pan Li via Gcc-patches

From: Pan Li 

Update in v2:

1. Remove the template of vfrec7 frm class.
2. Update the vfrec7_frm_obj declaration.

Original logs:

This patch would like to support the rounding mode API for the
VFREC7 as the below samples.

* __riscv_vfrec7_v_f32m1_rm
* __riscv_vfrec7_v_f32m1_rm_m

Signed-off-by: Pan Li 

gcc/ChangeLog:

* config/riscv/riscv-vector-builtins-bases.cc
(class vfrec7_frm): New class for frm.
(vfrec7_frm_obj): New declaration.
(BASE): Ditto.
* config/riscv/riscv-vector-builtins-bases.h: Ditto.
* config/riscv/riscv-vector-builtins-functions.def
(vfrec7_frm): New intrinsic function definition.
* config/riscv/vector-iterators.md
(VFMISC): Remove VFREC7.
(misc_op): Ditto.
(float_insn_type): Ditto.
(VFMISC_FRM): New int iterator.
(misc_frm_op): New op for frm.
(float_frm_insn_type): New type for frm.
* config/riscv/vector.md (@pred_):
New pattern for misc frm.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/base/float-point-rec7.c: New test.
---
 .../riscv/riscv-vector-builtins-bases.cc  | 16 ++
 .../riscv/riscv-vector-builtins-bases.h   |  1 +
 .../riscv/riscv-vector-builtins-functions.def |  2 ++
 gcc/config/riscv/vector-iterators.md  | 12 +--
 gcc/config/riscv/vector.md| 23 ++
 .../riscv/rvv/base/float-point-rec7.c | 31 +++
 6 files changed, 82 insertions(+), 3 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/base/float-point-rec7.c

diff --git a/gcc/config/riscv/riscv-vector-builtins-bases.cc 
b/gcc/config/riscv/riscv-vector-builtins-bases.cc
index 2074dac0f16..f2124080ef9 100644
--- a/gcc/config/riscv/riscv-vector-builtins-bases.cc
+++ b/gcc/config/riscv/riscv-vector-builtins-bases.cc
@@ -646,6 +646,20 @@ public:
   }
 };
 
+/* Implements below instructions for frm
+   - vfrec7
+*/
+class vfrec7_frm : public function_base
+{
+public:
+  bool has_rounding_mode_operand_p () const override { return true; }
+
+  rtx expand (function_expander &e) const override
+  {
+return e.use_exact_insn (code_for_pred (UNSPEC_VFREC7, e.vector_mode ()));
+  }
+};
+
 /* Implements vrsub.  */
 class vrsub : public function_base
 {
@@ -2433,6 +2447,7 @@ static CONSTEXPR const unop vfsqrt_obj;
 static CONSTEXPR const unop_frm vfsqrt_frm_obj;
 static CONSTEXPR const float_misc vfrsqrt7_obj;
 static CONSTEXPR const float_misc vfrec7_obj;
+static CONSTEXPR const vfrec7_frm vfrec7_frm_obj;
 static CONSTEXPR const binop vfmin_obj;
 static CONSTEXPR const binop vfmax_obj;
 static CONSTEXPR const float_misc vfsgnj_obj;
@@ -2681,6 +2696,7 @@ BASE (vfsqrt)
 BASE (vfsqrt_frm)
 BASE (vfrsqrt7)
 BASE (vfrec7)
+BASE (vfrec7_frm)
 BASE (vfmin)
 BASE (vfmax)
 BASE (vfsgnj)
diff --git a/gcc/config/riscv/riscv-vector-builtins-bases.h 
b/gcc/config/riscv/riscv-vector-builtins-bases.h
index 5c91381bd4c..2a9381eec5e 100644
--- a/gcc/config/riscv/riscv-vector-builtins-bases.h
+++ b/gcc/config/riscv/riscv-vector-builtins-bases.h
@@ -187,6 +187,7 @@ extern const function_base *const vfsqrt;
 extern const function_base *const vfsqrt_frm;
 extern const function_base *const vfrsqrt7;
 extern const function_base *const vfrec7;
+extern const function_base *const vfrec7_frm;
 extern const function_base *const vfmin;
 extern const function_base *const vfmax;
 extern const function_base *const vfsgnj;
diff --git a/gcc/config/riscv/riscv-vector-builtins-functions.def 
b/gcc/config/riscv/riscv-vector-builtins-functions.def
index a821aca6a4b..34def6bb82f 100644
--- a/gcc/config/riscv/riscv-vector-builtins-functions.def
+++ b/gcc/config/riscv/riscv-vector-builtins-functions.def
@@ -396,6 +396,8 @@ DEF_RVV_FUNCTION (vfrsqrt7, alu, full_preds, f_v_ops)
 // 13.10. Vector Floating-Point Reciprocal Estimate Instruction
 DEF_RVV_FUNCTION (vfrec7, alu, full_preds, f_v_ops)
 
+DEF_RVV_FUNCTION (vfrec7_frm, alu_frm, full_preds, f_v_ops)
+
 // 13.11. Vector Floating-Point MIN/MAX Instructions
 DEF_RVV_FUNCTION (vfmin, alu, full_preds, f_vvv_ops)
 DEF_RVV_FUNCTION (vfmin, alu, full_preds, f_vvf_ops)
diff --git a/gcc/config/riscv/vector-iterators.md 
b/gcc/config/riscv/vector-iterators.md
index 30808ceb241..9dd611e254b 100644
--- a/gcc/config/riscv/vector-iterators.md
+++ b/gcc/config/riscv/vector-iterators.md
@@ -1867,7 +1867,9 @@ (define_int_iterator VSAT_SHIFT_OP [UNSPEC_VSSRL 
UNSPEC_VSSRA])
 
 (define_int_iterator VMISC [UNSPEC_VMSBF UNSPEC_VMSIF UNSPEC_VMSOF])
 
-(define_int_iterator VFMISC [UNSPEC_VFRSQRT7 UNSPEC_VFREC7])
+(define_int_iterator VFMISC [UNSPEC_VFRSQRT7])
+
+(define_int_iterator VFMISC_FRM [UNSPEC_VFREC7])
 
 (define_int_iterator VFCVTS [UNSPEC_VFCVT UNSPEC_UNSIGNED_VFCVT])
 
@@ -1890,9 +1892,13 @@ (define_int_attr sat_insn_type [(UNSPEC_VAADDU "vaalu") 
(UNSPEC_VAADD "vaalu")
(UNSPEC_VNCLIPU "vnclip")])
 
 (define_int_attr misc_op [(UNSPEC_VMSBF "sbf") (UNSPEC_VMSIF "

RE: [PATCH v1] RISC-V: Support RVV VFREC7 rounding mode intrinsic API

Thanks Kito for comments, updated in PATCH v2.

https://gcc.gnu.org/pipermail/gcc-patches/2023-August/627367.html

Pan

-Original Message-
From: Kito Cheng  
Sent: Monday, August 14, 2023 10:07 PM
To: 钟居哲 
Cc: Li, Pan2 ; gcc-patches ; Wang, 
Yanzhang 
Subject: Re: [PATCH v1] RISC-V: Support RVV VFREC7 rounding mode intrinsic API

> +template

You don't need a template class here since it can only be UNSPEC_VFREC7.

> +class vfrec7_frm : public function_base
> +{
> +public:
> +  bool has_rounding_mode_operand_p () const override { return true; }
> +
> +  rtx expand (function_expander &e) const override
> +  {
> +return e.use_exact_insn (code_for_pred (UNSPEC, e.vector_mode ()));
> +  }
> +};
> +
> /* Implements vrsub.  */
> class vrsub : public function_base
> {
> @@ -2433,6 +2448,7 @@ static CONSTEXPR const unop vfsqrt_obj;
> static CONSTEXPR const unop_frm vfsqrt_frm_obj;
> static CONSTEXPR const float_misc vfrsqrt7_obj;
> static CONSTEXPR const float_misc vfrec7_obj;
> +static CONSTEXPR const vfrec7_frm vfrec7_frm_obj;

Then `static CONSTEXPR const vfrec7_frm vfrec7_frm_obj;` here

> static CONSTEXPR const binop vfmin_obj;
> static CONSTEXPR const binop vfmax_obj;
> static CONSTEXPR const float_misc vfsgnj_obj;

Re: [PATCH][RFC] tree-optimization/92335 - Improve sinking heuristics for vectorization

2023-08-14 Thread Prathamesh Kulkarni via Gcc-patches

On Mon, 7 Aug 2023 at 13:19, Richard Biener  wrote:
>
> On Mon, Aug 7, 2023 at 2:05 AM Prathamesh Kulkarni via Gcc-patches
>  wrote:
> >
> > On Thu, 3 Aug 2023 at 17:48, Richard Biener  wrote:
> > >
> > > On Thu, 3 Aug 2023, Richard Biener wrote:
> > >
> > > > On Thu, 3 Aug 2023, Richard Biener wrote:
> > > >
> > > > > On Thu, 3 Aug 2023, Prathamesh Kulkarni wrote:
> > > > >
> > > > > > On Wed, 2 Aug 2023 at 14:17, Richard Biener via Gcc-patches
> > > > > >  wrote:
> > > > > > >
> > > > > > > On Mon, 31 Jul 2023, Jeff Law wrote:
> > > > > > >
> > > > > > > >
> > > > > > > >
> > > > > > > > On 7/28/23 01:05, Richard Biener via Gcc-patches wrote:
> > > > > > > > > The following delays sinking of loads within the same 
> > > > > > > > > innermost
> > > > > > > > > loop when it was unconditional before.  That's a not uncommon
> > > > > > > > > issue preventing vectorization when masked loads are not 
> > > > > > > > > available.
> > > > > > > > >
> > > > > > > > > Bootstrapped and tested on x86_64-unknown-linux-gnu.
> > > > > > > > >
> > > > > > > > > I have a followup patch improving sinking that without this 
> > > > > > > > > would
> > > > > > > > > cause more of the problematic sinking - now that we have a 
> > > > > > > > > second
> > > > > > > > > sink pass after loop opts this looks like a reasonable 
> > > > > > > > > approach?
> > > > > > > > >
> > > > > > > > > OK?
> > > > > > > > >
> > > > > > > > > Thanks,
> > > > > > > > > Richard.
> > > > > > > > >
> > > > > > > > >  PR tree-optimization/92335
> > > > > > > > >  * tree-ssa-sink.cc (select_best_block): Before loop
> > > > > > > > >  optimizations avoid sinking unconditional loads/stores
> > > > > > > > >  in innermost loops to conditional executed places.
> > > > > > > > >
> > > > > > > > >  * gcc.dg/tree-ssa/ssa-sink-10.c: Disable vectorizing.
> > > > > > > > >  * gcc.dg/tree-ssa/predcom-9.c: Clone from ssa-sink-10.c,
> > > > > > > > >  expect predictive commoning to happen instead of sinking.
> > > > > > > > >  * gcc.dg/vect/pr65947-3.c: Adjust.
> > > > > > > > I think it's reasonable -- there's probably going to be cases 
> > > > > > > > where it's not
> > > > > > > > great, but more often than not I think it's going to be a 
> > > > > > > > reasonable
> > > > > > > > heuristic.
> > > > > > > >
> > > > > > > > If there is undesirable fallout, better to find it over the 
> > > > > > > > coming months than
> > > > > > > > next spring.  So I'd suggest we go forward now to give more 
> > > > > > > > time to find any
> > > > > > > > pathological cases (if they exist).
> > > > > > >
> > > > > > > Agreed, I've pushed this now.
> > > > > > Hi Richard,
> > > > > > After this patch (committed in 
> > > > > > 399c8dd44ff44f4b496223c7cc980651c4d6f6a0),
> > > > > > pr65947-7.c "failed" for aarch64-linux-gnu:
> > > > > > FAIL: gcc.dg/vect/pr65947-7.c scan-tree-dump-not vect "LOOP 
> > > > > > VECTORIZED"
> > > > > > FAIL: gcc.dg/vect/pr65947-7.c -flto -ffat-lto-objects
> > > > > > scan-tree-dump-not vect "LOOP VECTORIZED"
> > > > > >
> > > > > > /* { dg-final { scan-tree-dump-not "LOOP VECTORIZED" "vect" { 
> > > > > > target {
> > > > > > ! vect_fold_extract_last } } } } */
> > > > > >
> > > > > > With your commit, condition_reduction in pr65947-7.c gets vectorized
> > > > > > regardless of vect_fold_extract_last,
> > > > > > which gates the above test (which is an improvement, because the
> > > > > > function didn't get vectorized before the commit).
> > > > > >
> > > > > > The attached patch thus removes the gating on 
> > > > > > vect_fold_extract_last,
> > > > > > and the test passes again.
> > > > > > OK to commit ?
> > > > >
> > > > > OK.
> > > >
> > > > Or wait - the loop doesn't vectorize on x86_64, so I guess one
> > > > critical target condition is missing.  Can you figure out which?
> > >
> > > I see
> > >
> > > /space/rguenther/src/gcc/gcc/testsuite/gcc.dg/vect/pr65947-7.c:18:21:
> > > note:   vect_is_simple_use: operand last_19 = PHI ,
> > > type of def: reduction
> > > /space/rguenther/src/gcc/gcc/testsuite/gcc.dg/vect/pr65947-7.c:18:21:
> > > note:   vect_is_simple_use: vectype vector(4) int
> > > /space/rguenther/src/gcc/gcc/testsuite/gcc.dg/vect/pr65947-7.c:18:21:
> > > missed:   multiple types in double reduction or condition reduction or
> > > fold-left reduction.
> > > /space/rguenther/src/gcc/gcc/testsuite/gcc.dg/vect/pr65947-7.c:13:1:
> > > missed:   not vectorized: relevant phi not supported: last_19 = PHI
> > > 
> > > /space/rguenther/src/gcc/gcc/testsuite/gcc.dg/vect/pr65947-7.c:18:21:
> > > missed:  bad operation or unsupported loop bound.
> > Hi Richard,
> > Looking at the aarch64 vect dump, it seems the loop in
> > condition_reduction gets vectorized with V4HI mode
> > while fails for other modes in vectorizable_condition:
> >
> >   if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
> >   && ncopies > 1)
> > {
> >   if (dump_enabled_p ())
> > dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
>

Re: [PATCH v1] RISC-V: Support RVV VFREC7 rounding mode intrinsic API

2023-08-14 Thread Kito Cheng via Gcc-patches

Checked with doc and llvm implementation, LGTM

Re: [PATCH v2 1/2] libstdc++: Implement more maintainable header

2023-08-14 Thread Arsen Arsenović via Gcc-patches

Arsen Arsenović  writes:

> This commit replaces the ad-hoc logic in <version> with an AutoGen
> database that (mostly) declaratively generates a version.h bit which
> combines all of the FTM logic across all headers together.
>
> This generated header defines macros of the form __glibcxx_foo,
> equivalent to their __cpp_lib_foo variants, according to rules specified
> in version.def and, optionally, if __glibcxx_want_foo or
> __glibcxx_want_all are defined, also defines __cpp_lib_foo forms with
> the same definition.
>
> libstdc++-v3/ChangeLog:
>
>   * include/Makefile.am (bits_freestanding): Add version.h.
>   (allcreated): Add version.h.
>   (${bits_srcdir}/version.h): New rule.  Regenerates
>   version.h out of version.{def,tpl}.
>   * include/Makefile.in: Regenerate.
>   * include/bits/version.def: New file.  Declares a list of
>   all feature test macros, their values and their preconditions.
>   * include/bits/version.tpl: New file.  Turns version.def
>   into a sequence of #if blocks.
>   * include/bits/version.h: New file.  Generated from
>   version.def.
>   * include/std/version: Replace with a __glibcxx_want_all define
>   and bits/version.h include.
> ---
> This patchset is a rebase of
> https://inbox.sourceware.org/libstdc++/20230429101640.1697750-1-ar...@aarsen.me/
>
> ... passing the same two checks (difall / vercmp) I wrote for the first
> pass.  Testsuite runs are still pending.
>
> Changes in this revision:
> - Replace the ${bits_srcdir}/version.h rule with a update-version phony,
> - Add the new __cpp_lib_chrono value,
> - Add __cpp_lib_{ranges_{contains,find_last,fold,iota}},
> - Add comments to various replaced conditions which summarize their
>   condition,
> - Correct a few minor errors spotted in review
>
> OK for trunk (if those testsuite runs end up clean)?

Same tests as the first time around passed on x86_64-pc-linux-gnu (that
is, regression testing + a large libstdc++ harness).
-- 
Arsen Arsenović


signature.asc
Description: PGP signature

[PATCH v2] analyzer: New option fanalyzer-show-events-in-system-headers [PR110543]

2023-08-14 Thread Benjamin Priour via Gcc-patches

From: benjamin priour 

Plenty useful, thanks David. I've adjusted a few things, especially
the artifacts of earlier versions I missed when building the commit.

I didn't know how to test for warnings within , I couldn't figure out a portable 
test.
I cannot pinpoint the line the warning is issued at in an inline DejaGNU 
directive,
nor can I safely say the stack depth if I check a multiline-output (nor the 
methods names)

In the end, I found an alternative: I am checking for the presence of the event 
"entry of 'main'".
Indeed, the comment of diagnostic_manager::finish_pruning reads:
If all we're left with is in one function, then filter function entry events.
The provided test case can only go into main and std::* frames, so if "entry 
of 'main'" exists,
it means we are also going into std::* frames.

I've also adjusted the comment of prune_system_headers, analyzer.opt and added 
an entry to invoke.texi.

Successfully regstrapped off trunk
54be338589ea93ad4ff53d22adde476a0582537b on x86_64-linux-gnu.

Thanks,
Benjamin.

Patch below.


This patch introduces -fanalyzer-show-events-in-system-headers,
disabled by default.

This option reduces the noise of the analyzer emitted diagnostics
when dealing with system headers.
The new option only affects the display of the diagnostics,
but doesn't hinder the actual analysis.

Given a diagnostics path diving into a system header in the form
[
  prefix events...,
  system header call,
system header entry,
events within system headers...,
  system header return,
  suffix events...
]
then disabling the option (either by default or explicitly)
will shorten the path into:
[
  prefix events...,
  system header call,
  system header return,
  suffix events...
]

Signed-off-by: benjamin priour 

gcc/analyzer/ChangeLog:

PR analyzer/110543
* analyzer.opt: Add new option.
* diagnostic-manager.cc
(diagnostic_manager::prune_path): Call prune_system_headers.
(prune_frame): New function that deletes all events in a frame.
(diagnostic_manager::prune_system_headers): New function.
* diagnostic-manager.h: Add prune_system_headers declaration.

gcc/ChangeLog:

PR analyzer/110543
* doc/invoke.texi: Add documentation of
fanalyzer-show-events-in-system-headers

gcc/testsuite/ChangeLog:

PR analyzer/110543
* g++.dg/analyzer/fanalyzer-show-events-in-system-headers-default.C:
New test.
* g++.dg/analyzer/fanalyzer-show-events-in-system-headers-no.C:
New test.
* g++.dg/analyzer/fanalyzer-show-events-in-system-headers.C:
New test.
---
 gcc/analyzer/analyzer.opt |  4 +
 gcc/analyzer/diagnostic-manager.cc| 96 +++
 gcc/analyzer/diagnostic-manager.h |  1 +
 gcc/doc/invoke.texi   |  9 ++
 ...er-show-events-in-system-headers-default.C | 18 
 ...nalyzer-show-events-in-system-headers-no.C | 19 
 .../fanalyzer-show-events-in-system-headers.C | 14 +++
 7 files changed, 161 insertions(+)
 create mode 100644 
gcc/testsuite/g++.dg/analyzer/fanalyzer-show-events-in-system-headers-default.C
 create mode 100644 
gcc/testsuite/g++.dg/analyzer/fanalyzer-show-events-in-system-headers-no.C
 create mode 100644 
gcc/testsuite/g++.dg/analyzer/fanalyzer-show-events-in-system-headers.C

diff --git a/gcc/analyzer/analyzer.opt b/gcc/analyzer/analyzer.opt
index 2760aaa8151..7917473d122 100644
--- a/gcc/analyzer/analyzer.opt
+++ b/gcc/analyzer/analyzer.opt
@@ -290,6 +290,10 @@ fanalyzer-transitivity
 Common Var(flag_analyzer_transitivity) Init(0)
 Enable transitivity of constraints during analysis.
 
+fanalyzer-show-events-in-system-headers
+Common Var(flag_analyzer_show_events_in_system_headers) Init(0)
+Show events within system headers in analyzer execution paths.
+
 fanalyzer-call-summaries
 Common Var(flag_analyzer_call_summaries) Init(0)
 Approximate the effect of function calls to simplify analysis.
diff --git a/gcc/analyzer/diagnostic-manager.cc 
b/gcc/analyzer/diagnostic-manager.cc
index cfca305d552..430c4dc3d58 100644
--- a/gcc/analyzer/diagnostic-manager.cc
+++ b/gcc/analyzer/diagnostic-manager.cc
@@ -23,6 +23,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "system.h"
 #include "coretypes.h"
 #include "tree.h"
+#include "input.h"
 #include "pretty-print.h"
 #include "gcc-rich-location.h"
 #include "gimple-pretty-print.h"
@@ -2281,6 +2282,8 @@ diagnostic_manager::prune_path (checker_path *path,
   path->maybe_log (get_logger (), "path");
   prune_for_sm_diagnostic (path, sm, sval, state);
   prune_interproc_events (path);
+  if (! flag_analyzer_show_events_in_system_headers)
+prune_system_headers (path);
   consolidate_conditions (path);
   finish_pruning (path);
   path->maybe_log (get_logger (), "pruned");
@@ -2667,6 +2670,99 @@ diagnostic_manager::prune_interproc_events (checker_path 
*path) const
   while (changed);
 }
 
+/* Remove everything within [call p

Avoid division by zero in fold_loop_internal_call

2023-08-14 Thread Jan Hubicka via Gcc-patches

Hi,
My patch to fix the profile after folding an internal call is missing a check
for the case where the profile was already zero before if-conversion.

Bootstrapped/regtested x86_64-linux, committed.

gcc/ChangeLog:

PR gcov-profile/110988
* tree-cfg.cc (fold_loop_internal_call): Avoid division by zero.

diff --git a/gcc/tree-cfg.cc b/gcc/tree-cfg.cc
index ab1f8067c54..105f4b1c953 100644
--- a/gcc/tree-cfg.cc
+++ b/gcc/tree-cfg.cc
@@ -7734,11 +7734,14 @@ fold_loop_internal_call (gimple *g, tree value)
 test.  This should not happen as the guarded code should
 start with pre-header.  */
  gcc_assert (single_pred_edge (taken_edge->dest));
- taken_edge->dest->count
-   = taken_edge->dest->count.apply_scale (new_count,
-  old_count);
- scale_strictly_dominated_blocks (taken_edge->dest,
-  new_count, old_count);
+ if (old_count.nonzero_p ())
+   {
+ taken_edge->dest->count
+   = taken_edge->dest->count.apply_scale (new_count,
+  old_count);
+ scale_strictly_dominated_blocks (taken_edge->dest,
+  new_count, old_count);
+   }
}
}
 }

Re: [PATCH v2] analyzer: New option fanalyzer-show-events-in-system-headers [PR110543]

2023-08-14 Thread David Malcolm via Gcc-patches

On Mon, 2023-08-14 at 17:48 +0200, priour...@gmail.com wrote:
> From: benjamin priour 
> 
> Plenty useful, thanks David. I've adjusted some few things, especially
> the artifacts of earlier versions I missed when building the commit.
> 
> I didn't how to test for warnings within , I couldn't figure a 
> portable test.
> I cannot pinpoint the line the warning is issued at in an inline DejaGNU 
> directive,
> nor can I safely say the stack depth if I check a multiline-output (nor the 
> methods names)
> 
> In the end, I found out an alternative, I am checking for the presence of 
> event "entry of 'main'".
> Indeed, diagnostic_manager::finish_pruning comment's reads
> If all we're left with is in one function, then filter function entry events.
> The provided test case can only goes into main and std::* frames, so if 
> "entry of 'main'" exists,
> it means we are also going into std::* frames.
> 
> I've also adjusted the comment of prune_system_headers, analyzer.opt and 
> added an entry to invoker.texi.
> 
> Successfully regstrapped off trunk
> 54be338589ea93ad4ff53d22adde476a0582537b on x86_64-linux-gnu.

Thanks for the updated patch.

This is ready to push to trunk.

Dave

Re: [PATCH] tree-optimization/110991 - unroll size estimate after vectorization

2023-08-14 Thread Jan Hubicka via Gcc-patches

> The following testcase shows that we are bad at identifying inductions
> that will be optimized away after vectorizing them because SCEV doesn't
> handle vectorized defs.  The following rolls a simpler identification
> of SSA cycles covering a PHI and an assignment with a binary operator
> with a constant second operand.
> 
> Bootstrapped and tested on x86_64-unknown-linux-gnu.
> 
> Note, I also have a more general approach (will reply to this mail
> with an RFC).

Looks good to me.  This can clearly be generalized to more complicated
expressions, so is that what you plan to do next?

Honza
> 
> Any comments on this particular change?
> 
>   PR tree-optimization/110991
>   * tree-ssa-loop-ivcanon.cc (constant_after_peeling): Handle
>   VIEW_CONVERT_EXPR , handle more simple IV-like SSA cycles
>   that will end up constant.
> 
>   * gcc.dg/tree-ssa/cunroll-16.c: New testcase.
> ---
>  gcc/testsuite/gcc.dg/tree-ssa/cunroll-16.c | 17 
>  gcc/tree-ssa-loop-ivcanon.cc   | 46 +-
>  2 files changed, 62 insertions(+), 1 deletion(-)
>  create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/cunroll-16.c
> 
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/cunroll-16.c 
> b/gcc/testsuite/gcc.dg/tree-ssa/cunroll-16.c
> new file mode 100644
> index 000..9bb66ff8299
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/cunroll-16.c
> @@ -0,0 +1,17 @@
> +/* PR/110991 */
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -fdump-tree-cunroll-details -fdump-tree-optimized" } */
> +
> +static unsigned char a;
> +static signed char b;
> +void foo(void);
> +int main() {
> +  a = 25;
> +  for (; a > 13; --a)
> +b = a > 127 ?: a << 3;
> +  if (!b)
> +foo();
> +}
> +
> +/* { dg-final { scan-tree-dump "optimized: loop with \[0-9\]\+ iterations 
> completely unrolled" "cunroll" } } */
> +/* { dg-final { scan-tree-dump-not "foo" "optimized" } } */
> diff --git a/gcc/tree-ssa-loop-ivcanon.cc b/gcc/tree-ssa-loop-ivcanon.cc
> index a895e8e65be..99e50ee2efe 100644
> --- a/gcc/tree-ssa-loop-ivcanon.cc
> +++ b/gcc/tree-ssa-loop-ivcanon.cc
> @@ -166,6 +166,11 @@ constant_after_peeling (tree op, gimple *stmt, class 
> loop *loop)
>if (CONSTANT_CLASS_P (op))
>  return true;
>  
> +  /* Get at the actual SSA operand.  */
> +  if (handled_component_p (op)
> +  && TREE_CODE (TREE_OPERAND (op, 0)) == SSA_NAME)
> +op = TREE_OPERAND (op, 0);
> +
>/* We can still fold accesses to constant arrays when index is known.  */
>if (TREE_CODE (op) != SSA_NAME)
>  {
> @@ -198,7 +203,46 @@ constant_after_peeling (tree op, gimple *stmt, class 
> loop *loop)
>tree ev = analyze_scalar_evolution (loop, op);
>if (chrec_contains_undetermined (ev)
>|| chrec_contains_symbols (ev))
> -return false;
> +{
> +  if (ANY_INTEGRAL_TYPE_P (TREE_TYPE (op)))
> + {
> +   gassign *ass = nullptr;
> +   gphi *phi = nullptr;
> +   if (is_a  (SSA_NAME_DEF_STMT (op)))
> + {
> +   ass = as_a  (SSA_NAME_DEF_STMT (op));
> +   if (TREE_CODE (gimple_assign_rhs1 (ass)) == SSA_NAME)
> + phi = dyn_cast 
> + (SSA_NAME_DEF_STMT (gimple_assign_rhs1  (ass)));
> + }
> +   else if (is_a  (SSA_NAME_DEF_STMT (op)))
> + {
> +   phi = as_a  (SSA_NAME_DEF_STMT (op));
> +   if (gimple_bb (phi) == loop->header)
> + {
> +   tree def = gimple_phi_arg_def_from_edge
> + (phi, loop_latch_edge (loop));
> +   if (TREE_CODE (def) == SSA_NAME
> +   && is_a  (SSA_NAME_DEF_STMT (def)))
> + ass = as_a  (SSA_NAME_DEF_STMT (def));
> + }
> + }
> +   if (ass && phi)
> + {
> +   tree rhs1 = gimple_assign_rhs1 (ass);
> +   if (gimple_assign_rhs_class (ass) == GIMPLE_BINARY_RHS
> +   && CONSTANT_CLASS_P (gimple_assign_rhs2 (ass))
> +   && rhs1 == gimple_phi_result (phi)
> +   && gimple_bb (phi) == loop->header
> +   && (gimple_phi_arg_def_from_edge (phi, loop_latch_edge (loop))
> +   == gimple_assign_lhs (ass))
> +   && (CONSTANT_CLASS_P (gimple_phi_arg_def_from_edge
> +  (phi, loop_preheader_edge (loop)
> + return true;
> + }
> + }
> +  return false;
> +}
>return true;
>  }
>  
> -- 
> 2.35.3

Re: [PATCH] Fix for bug libstdc++/110860

2023-08-14 Thread Jonathan Wakely via Gcc-patches

On Mon, 14 Aug 2023 at 10:58, Paul Dreik via Libstdc++ <
libstd...@gcc.gnu.org> wrote:

> The patch below fixes an issue with the fix already committed for
> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=110860 which unfortunately
> was not sufficient for small magnitude floating point values.
>
> With the patch in place, the code now survives the fuzzing I used to
> find the problem in the first place. Tested on amd64.
>
> I prepared the patch using git show, which should include the signoff as
> instructed per the DCO.
>

I couldn't apply the patch directly from the email, but I'm not sure where
it got mangled. I just applied it by hand instead.

Pushed to trunk, thanks for the patch!

I'll push it to gcc-13 shortly too.




>
> Thanks, Paul
>
> 
> commit 848b8d948787495e64ed9c55d681eccf730b74fb
> Author: Paul Dreik 
> Date:   Mon Aug 14 11:52:30 2023 +0200
>
>  libstdc++: Avoid problematic use of log10 in std::format [PR110860]
>
>  If abs(__v) is smaller than one, the result will be of the
>  form 0.x. It is only if the magnitude is large that more digits
>  are needed before the decimal point.
>
>  This uses frexp instead of log10 which should be less expensive
>  and have sufficient precision for the desired purpose.
>
>  It removes the problematic cases where log10 will be negative or not
>  fit in an int.
>
>  Signed-off-by: Paul Dreik 
>
> diff --git a/libstdc++-v3/include/std/format
> b/libstdc++-v3/include/std/format
> index f4520ff3f..729e3d4b9 100644
> --- a/libstdc++-v3/include/std/format
> +++ b/libstdc++-v3/include/std/format
> @@ -1490,14 +1490,22 @@ namespace __format
>   // If the buffer is too small it's probably because of a
> large
>   // precision, or a very large value in fixed format.
>   size_t __guess = 8 + __prec;
> - if (__fmt == chars_format::fixed && __v != 0) // +ddd.prec
> + if (__fmt == chars_format::fixed) // +ddd.prec
> {
> - if constexpr (is_same_v<_Fp, float>)
> -   __guess += __builtin_log10f(__v < 0.0f ? -__v : __v);
> - else if constexpr (is_same_v<_Fp, double>)
> -   __guess += __builtin_log10(__v < 0.0 ? -__v : __v);
> - else if constexpr (is_same_v<_Fp, long double>)
> -   __guess += __builtin_log10l(__v < 0.0l ? -__v : __v);
> + if constexpr (is_same_v<_Fp, float> || is_same_v<_Fp,
> double> ||
> is_same_v<_Fp, long double>)
> +   {
> + // the number of digits to the left of the decimal
> point
> + // is floor(log10(max(abs(__v),1)))+1
> + int __exp{};
> + if constexpr (is_same_v<_Fp, float>)
> +   __builtin_frexpf(__v, &__exp);
> + else if constexpr (is_same_v<_Fp, double>)
> +   __builtin_frexp(__v, &__exp);
> + else if constexpr (is_same_v<_Fp, long double>)
> +   __builtin_frexpl(__v, &__exp);
> + if (__exp>0)
> +   __guess += 1U + __exp * 4004U / 13301U; //
> log10(2) approx.
> +   }
>   else
> __guess += numeric_limits<_Fp>::max_exponent10;
> }
>

Re: [PATCH v4] Mode-Switching: Fix SET_SRC ICE for create_pre_exit

2023-08-14 Thread Jeff Law via Gcc-patches




On 8/12/23 18:56, pan2...@intel.com wrote:

From: Pan Li 

In some cases, like gcc/testsuite/gcc.dg/pr78148.c on RISC-V, the insn that
SET_SRC is applied to in create_pre_exit has only 1 operand. For example, as below.

(insn 13 9 14 2 (clobber (reg/i:TI 10 a0)) 
"gcc/testsuite/gcc.dg/pr78148.c":24:1 -1
   (expr_list:REG_UNUSED (reg/i:TI 10 a0)
 (nil)))

Unfortunately, SET_SRC requires at least 2 operands, which results in a
segmentation fault here. The SH4 part is what triggers the segmentation fault;
it looks like it is only valid when the return_copy_pat is a load or something
like that. Thus, this patch tries to fix it by restricting SET_SRC to SET insns.

Signed-off-by: Pan Li 

gcc/ChangeLog:

* mode-switching.cc (create_pre_exit): Add SET insn check.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/mode-switch-ice-1.c: New test.

OK.  Thanks for the updated version.

jeff

[pushed] c++: -fconcepts and __cpp_concepts

2023-08-14 Thread Jason Merrill via Gcc-patches

Tested x86_64-pc-linux-gnu, applying to trunk.

-- 8< --

Since -fconcepts no longer implies -fconcepts-ts, we shouldn't advertise TS
support with __cpp_concepts=201507L.  Also fix one case where -std=c++14
-fconcepts wasn't working (as found by range-v3 calendar).  Fixing other
cases is not a priority, probably better to reject that flag combination if
there are further issues.

gcc/c-family/ChangeLog:

* c-cppbuiltin.cc (c_cpp_builtins): Adjust __cpp_concepts.

gcc/cp/ChangeLog:

* parser.cc (cp_parser_simple_type_specifier): Handle -std=c++14
-fconcepts.
---
 gcc/c-family/c-cppbuiltin.cc | 2 +-
 gcc/cp/parser.cc | 5 +++--
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/gcc/c-family/c-cppbuiltin.cc b/gcc/c-family/c-cppbuiltin.cc
index 6bd4c1261a7..f2b12fd63db 100644
--- a/gcc/c-family/c-cppbuiltin.cc
+++ b/gcc/c-family/c-cppbuiltin.cc
@@ -1089,7 +1089,7 @@ c_cpp_builtins (cpp_reader *pfile)
}
   if (flag_concepts)
 {
- if (cxx_dialect >= cxx20)
+ if (cxx_dialect >= cxx20 || !flag_concepts_ts)
cpp_define (pfile, "__cpp_concepts=202002L");
   else
 cpp_define (pfile, "__cpp_concepts=201507L");
diff --git a/gcc/cp/parser.cc b/gcc/cp/parser.cc
index 2d27376d988..7f646704d3f 100644
--- a/gcc/cp/parser.cc
+++ b/gcc/cp/parser.cc
@@ -20017,12 +20017,13 @@ cp_parser_simple_type_specifier (cp_parser* parser,
   /* Otherwise, look for a type-name.  */
   if (!type)
{
- if (cxx_dialect >= cxx17)
+ if (cxx_dialect >= cxx17 || flag_concepts)
cp_parser_parse_tentatively (parser);
 
  type = cp_parser_type_name (parser, (qualified_p && typename_p));
 
- if (cxx_dialect >= cxx17 && !cp_parser_parse_definitely (parser))
+ if ((cxx_dialect >= cxx17 || flag_concepts)
+ && !cp_parser_parse_definitely (parser))
type = NULL_TREE;
}
 

base-commit: 2d2b05f0691799f03062bf5c436462f14cad3e7c
-- 
2.39.3

[PATCH] Fortran: Avoid accessing gfc_charlen when not looking at BT_CHARACTER (PR 110677)

2023-08-14 Thread Martin Jambor

Hello,

this patch addresses an issue uncovered by the undefined behavior
sanitizer.  In function resolve_structure_cons in resolve.cc there is
a test starting with:

  if (cons->expr->ts.type == BT_CHARACTER && comp->ts.u.cl
  && comp->ts.u.cl->length
  && comp->ts.u.cl->length->expr_type == EXPR_CONSTANT

and UBSAN complained of loads from comp->ts.u.cl->length->expr_type of
integer value 1818451807 which is outside of the value range of the expr_t
enum.  If I understand the code correctly, the entire load was
unwanted because comp->ts.type in those cases is BT_CLASS and not
BT_CHARACTER.  This patch simply adds a check to make sure it is only
accessed in those cases.

I have verified that the UBSAN failure goes away with this patch, it
also passes bootstrap and testing on x86_64-linux.  OK for master?

Thanks,

Martin



gcc/fortran/ChangeLog:

2023-08-14  Martin Jambor  

PR fortran/110677
* resolve.cc (resolve_structure_cons): Check comp->ts is character
type before accessing stuff through comp->ts.u.cl.
---
 gcc/fortran/resolve.cc | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/gcc/fortran/resolve.cc b/gcc/fortran/resolve.cc
index e7c8d919bef..5b4dfc5fcd2 100644
--- a/gcc/fortran/resolve.cc
+++ b/gcc/fortran/resolve.cc
@@ -1396,8 +1396,9 @@ resolve_structure_cons (gfc_expr *expr, int init)
 the one of the structure, ensure this if the lengths are known at
 compile time and when we are dealing with PARAMETER or structure
 constructors.  */
-  if (cons->expr->ts.type == BT_CHARACTER && comp->ts.u.cl
- && comp->ts.u.cl->length
+  if (cons->expr->ts.type == BT_CHARACTER
+ && comp->ts.type == BT_CHARACTER
+ && comp->ts.u.cl && comp->ts.u.cl->length
  && comp->ts.u.cl->length->expr_type == EXPR_CONSTANT
  && cons->expr->ts.u.cl && cons->expr->ts.u.cl->length
  && cons->expr->ts.u.cl->length->expr_type == EXPR_CONSTANT
-- 
2.41.0

Re: [PATCH v1] c++: follow DR 2386 and update implementation of get_tuple_size [PR110216]

2023-08-14 Thread Jason Merrill via Gcc-patches

On 8/12/23 04:16, gnaggnoyil wrote:

DR 2386 updated the tuple_size requirements for structured binding and
it now requires tuple_size to be considered only if
std::tuple_size names a complete class type with member value. GCC
before this patch does not follow the updated requirements, and this
patch is intended to implement it.

DR 2386
PR c++/110216

gcc/cp/ChangeLog:

* decl.cc (get_tuple_size): Update implementation to follow DR 2386.

gcc/testsuite/ChangeLog:

* g++.dg/cpp1z/decomp10.C: Update expected error message for DR 2386.
* g++.dg/cpp1z/pr110216.C: New test.

Signed-off-by: gnaggnoyil

Pushed, thanks!

Note that the GCC DCO policy (https://gcc.gnu.org/dco.html) requires
real names in the sign-off; in this case I've applied the patch anyway
because it is small enough that it's not legally significant for copyright.

I think if you want to contribute larger patches under this pseudonym,
you should file a copyright assignment with the FSF, which explicitly
allows this. "If a contributor wants the FSF to publish only a
pseudonym, that is ok. The contributor should say this, and state the
desired pseudonym, when answering the request- form. The actual legal
papers will use the real name, but the FSF will publish only the
pseudonym."[2]

Thanks again,
Jason

[1]
https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git/commit/Documentation/process?id=d4563201f33a022fc0353033d9dfeb1606a88330

[2] https://www.gnu.org/prep/maintain/maintain.html#Copyright-Papers

[PATCH] arm: [MVE intrinsics] fix binary_acca_int32 and binary_acca_int64 shapes

Fix these two shapes, where we were failing to check the last
non-predicate parameter.

2023-08-14  Christophe Lyon  

gcc/
* config/arm/arm-mve-builtins-shapes.cc (binary_acca_int32): Fix loop 
bound.
(binary_acca_int64): Likewise.
---
 gcc/config/arm/arm-mve-builtins-shapes.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/gcc/config/arm/arm-mve-builtins-shapes.cc 
b/gcc/config/arm/arm-mve-builtins-shapes.cc
index 6d477a84330..1633084608e 100644
--- a/gcc/config/arm/arm-mve-builtins-shapes.cc
+++ b/gcc/config/arm/arm-mve-builtins-shapes.cc
@@ -455,7 +455,7 @@ struct binary_acca_int32_def : public overloaded_base<0>
|| (type = r.infer_vector_type (1)) == NUM_TYPE_SUFFIXES)
   return error_mark_node;
 
-unsigned int last_arg = i;
+unsigned int last_arg = i + 1;
 for (i = 1; i < last_arg; i++)
   if (!r.require_matching_vector_type (i, type))
return error_mark_node;
@@ -492,7 +492,7 @@ struct binary_acca_int64_def : public overloaded_base<0>
|| (type = r.infer_vector_type (1)) == NUM_TYPE_SUFFIXES)
   return error_mark_node;
 
-unsigned int last_arg = i;
+unsigned int last_arg = i + 1;
 for (i = 1; i < last_arg; i++)
   if (!r.require_matching_vector_type (i, type))
return error_mark_node;
-- 
2.34.1

[PATCH] arm: [MVE intrinsics] Remove dead check for float type in parse_element_type

Fix a likely copy/paste error, where we check if ch == 'f' after we
checked it's either 's' or 'u'.

2023-08-14  Christophe Lyon  

gcc/
* config/arm/arm-mve-builtins-shapes.cc (parse_element_type):
Remove dead check.
---
 gcc/config/arm/arm-mve-builtins-shapes.cc | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/gcc/config/arm/arm-mve-builtins-shapes.cc 
b/gcc/config/arm/arm-mve-builtins-shapes.cc
index 1633084608e..23eb9d0e69b 100644
--- a/gcc/config/arm/arm-mve-builtins-shapes.cc
+++ b/gcc/config/arm/arm-mve-builtins-shapes.cc
@@ -80,8 +80,7 @@ parse_element_type (const function_instance &instance, const 
char *&format)
 
   if (ch == 's' || ch == 'u')
 {
-  type_class_index tclass = (ch == 'f' ? TYPE_float
-: ch == 's' ? TYPE_signed
+  type_class_index tclass = (ch == 's' ? TYPE_signed
 : TYPE_unsigned);
   char *end;
   unsigned int bits = strtol (format, &end, 10);
-- 
2.34.1

Re: [PATCH v3] c++: extend cold, hot attributes to classes

2023-08-14 Thread Jason Merrill via Gcc-patches

On 8/11/23 09:18, Javier Martinez wrote:

Hi Jason,

Regarding the initialization example - no, the set of classes that we 
consider cold is more loosely defined.

On Thu, Aug 10, 2023 at 11:01 PM Jason Merrill wrote:

 > Yes, but that's because the implicit op== isn't declared lazily like
 > some other special member functions (CLASSTYPE_LAZY_*/lazily_declare_fn)
 > which can happen after the class is complete.

I see, thanks. I have fixed this now by injecting it directly from 
lazily_declare_fn, works well. Doing it from grokclassfn instead seems 
to be a nuisance because the explicit method attribute might be 
processed after the class-propagated attribute is injected, which is the 
wrong way around for the desired precedence.

 > I think it would work to check for (flags & (ATTR_FLAG_FUNCTION_NEXT |
 > ATTR_FLAG_DECL_NEXT)) and return without warning in that case.  You'd
 > still set *no_add_attr.

Correct, done.

I have added the patch as an attachment, if it garbles it then I will 
use git-send-email next time.

That worked fine, thanks.

@@ -1110,6 +1110,28 @@ handle_hot_attribute (tree *node, tree name, tree 
ARG_UNUSED (args),
 {
   /* Attribute hot processing is done later with lookup_attribute.  */
 }
+  else if ((TREE_CODE (*node) == RECORD_TYPE
+   || TREE_CODE (*node) == UNION_TYPE)
+ && c_dialect_cxx ())

I think you also want to check for ATTR_FLAG_TYPE_IN_PLACE.

@@ -7866,6 +7891,10 @@ finish_struct (tree t, tree attributes)
   && !LAMBDA_TYPE_P (t))
 add_stmt (build_min (TAG_DEFN, t));

+  /* This must be done after all lazily declared special member functions

+ have been injected.  */
+  propagate_class_warmth_attribute (t);

Maybe call this in check_bases_and_members instead?

Jason

[PATCH 1/9] arm: [MVE intrinsics] factorize vmullbq vmulltq

Factorize vmullbq, vmulltq so that they use the same parameterized
names.

2023-08-14  Christophe Lyon  

gcc/
* config/arm/iterators.md (mve_insn): Add vmullb, vmullt.
(isu): Add VMULLBQ_INT_S, VMULLBQ_INT_U, VMULLTQ_INT_S,
VMULLTQ_INT_U.
(supf): Add VMULLBQ_POLY_P, VMULLTQ_POLY_P, VMULLBQ_POLY_M_P,
VMULLTQ_POLY_M_P.
(VMULLBQ_INT, VMULLTQ_INT, VMULLBQ_INT_M, VMULLTQ_INT_M): Delete.
(VMULLxQ_INT, VMULLxQ_POLY, VMULLxQ_INT_M, VMULLxQ_POLY_M): New.
* config/arm/mve.md (mve_vmullbq_int_)
(mve_vmulltq_int_): Merge into ...
(@mve_q_int_) ... this.
(mve_vmulltq_poly_p, mve_vmullbq_poly_p): Merge into ...
(@mve_q_poly_): ... this.
(mve_vmullbq_int_m_, mve_vmulltq_int_m_): Merge 
into ...
(@mve_q_int_m_): ... this.
(mve_vmullbq_poly_m_p, mve_vmulltq_poly_m_p): Merge into ...
(@mve_q_poly_m_): ... this.
---
 gcc/config/arm/iterators.md |  23 +++--
 gcc/config/arm/mve.md   | 100 
 2 files changed, 38 insertions(+), 85 deletions(-)

diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md
index b13ff53d36f..fb003bcd67b 100644
--- a/gcc/config/arm/iterators.md
+++ b/gcc/config/arm/iterators.md
@@ -917,6 +917,7 @@
 
 (define_int_attr mve_insn [
 (UNSPEC_VCADD90 "vcadd") (UNSPEC_VCADD270 "vcadd")
+(UNSPEC_VCMLA "vcmla") (UNSPEC_VCMLA90 "vcmla") 
(UNSPEC_VCMLA180 "vcmla") (UNSPEC_VCMLA270 "vcmla")
 (UNSPEC_VCMUL "vcmul") (UNSPEC_VCMUL90 "vcmul") 
(UNSPEC_VCMUL180 "vcmul") (UNSPEC_VCMUL270 "vcmul")
 (VABAVQ_P_S "vabav") (VABAVQ_P_U "vabav")
 (VABAVQ_S "vabav") (VABAVQ_U "vabav")
@@ -1044,6 +1045,13 @@
 (VMOVNTQ_S "vmovnt") (VMOVNTQ_U "vmovnt")
 (VMULHQ_M_S "vmulh") (VMULHQ_M_U "vmulh")
 (VMULHQ_S "vmulh") (VMULHQ_U "vmulh")
+(VMULLBQ_INT_M_S "vmullb") (VMULLBQ_INT_M_U "vmullb")
+(VMULLBQ_INT_S "vmullb") (VMULLBQ_INT_U "vmullb")
+(VMULLBQ_POLY_M_P "vmullb") (VMULLTQ_POLY_M_P "vmullt")
+(VMULLBQ_POLY_P "vmullb")
+(VMULLTQ_INT_M_S "vmullt") (VMULLTQ_INT_M_U "vmullt")
+(VMULLTQ_INT_S "vmullt") (VMULLTQ_INT_U "vmullt")
+(VMULLTQ_POLY_P "vmullt")
 (VMULQ_M_N_S "vmul") (VMULQ_M_N_U "vmul") (VMULQ_M_N_F "vmul")
 (VMULQ_M_S "vmul") (VMULQ_M_U "vmul") (VMULQ_M_F "vmul")
 (VMULQ_N_S "vmul") (VMULQ_N_U "vmul") (VMULQ_N_F "vmul")
@@ -1209,7 +1217,6 @@
 (VSUBQ_M_N_S "vsub") (VSUBQ_M_N_U "vsub") (VSUBQ_M_N_F "vsub")
 (VSUBQ_M_S "vsub") (VSUBQ_M_U "vsub") (VSUBQ_M_F "vsub")
 (VSUBQ_N_S "vsub") (VSUBQ_N_U "vsub") (VSUBQ_N_F "vsub")
-(UNSPEC_VCMLA "vcmla") (UNSPEC_VCMLA90 "vcmla") 
(UNSPEC_VCMLA180 "vcmla") (UNSPEC_VCMLA270 "vcmla")
 ])
 
 (define_int_attr isu[
@@ -1246,6 +1253,8 @@
 (VMOVNBQ_S "i") (VMOVNBQ_U "i")
 (VMOVNTQ_M_S "i") (VMOVNTQ_M_U "i")
 (VMOVNTQ_S "i") (VMOVNTQ_U "i")
+(VMULLBQ_INT_S "s") (VMULLBQ_INT_U "u")
+(VMULLTQ_INT_S "s") (VMULLTQ_INT_U "u")
 (VNEGQ_M_S "s")
 (VQABSQ_M_S "s")
 (VQMOVNBQ_M_S "s") (VQMOVNBQ_M_U "u")
@@ -2330,6 +2339,10 @@
   (VMLADAVQ_U "u") (VMULHQ_S "s") (VMULHQ_U "u")
   (VMULLBQ_INT_S "s") (VMULLBQ_INT_U "u") (VQADDQ_S "s")
   (VMULLTQ_INT_S "s") (VMULLTQ_INT_U "u") (VQADDQ_U "u")
+  (VMULLBQ_POLY_P "p")
+  (VMULLTQ_POLY_P "p")
+  (VMULLBQ_POLY_M_P "p")
+  (VMULLTQ_POLY_M_P "p")
   (VMULQ_N_S "s") (VMULQ_N_U "u") (VMULQ_S "s")
   (VMULQ_U "u")
   (VQADDQ_N_S "s") (VQADDQ_N_U "u")
@@ -2713,8 +2726,8 @@
 (define_int_iterator VMINVQ [VMINVQ_U VMINVQ_S])
 (define_int_iterator VMLADAVQ [VMLADAVQ_U VMLADAVQ_S])
 (define_int_iterator VMULHQ [VMULHQ_S VMULHQ_U])
-(define_int_iterator VMULLBQ_INT [VMULLBQ_INT_U VMULLBQ_INT_S])
-(define_int_iterator VMULLTQ_INT [VMULLTQ_INT_U VMULLTQ_INT_S])
+(define_int_iterator VMULLxQ_INT [VMULLBQ_INT_U VMULLBQ_INT_S VMULLTQ_INT_U 
VMULLTQ_INT_S])
+(define_int_iterator VMULLxQ_POLY [VMULLBQ_POLY_P VMULLTQ_POLY_P])
 (define_int_iterator VMULQ [VMULQ_U VMULQ_S])
 (define_int_iterator VMULQ_N [VMULQ_N_U VMULQ_N_S])
 (define_int_iterator VQADDQ [VQADDQ_U VQADDQ_S])
@@ -2815,7 +2828,8 @@
 (define_int_iterator VSLIQ_M_N [VSLIQ_M_N_U VSLIQ_M_N_S])
 (define_int_iterator VRSHLQ_M [VRSHLQ_M_S VRSHLQ_M_U])
 (define_int_iterator VMINQ_M [VMINQ_M_S VMINQ_M_U])
-(define_int_iterator VMULLBQ_INT_M [VMULLBQ_INT_M_U VMULLBQ_INT_M_S])
+(define_int_iterator VMULLxQ_INT_M [VMULLBQ_INT_M_U VMULLBQ_INT_M_S 
VMULLTQ_I

[PATCH 2/9] arm: [MVE intrinsics] add unspec_mve_function_exact_insn_vmull

Introduce a function that will be used to build vmull intrinsics with
the _int variant.

2023-08-14  Christophe Lyon  

gcc/
* config/arm/arm-mve-builtins-functions.h (class
unspec_mve_function_exact_insn_vmull): New.
---
 gcc/config/arm/arm-mve-builtins-functions.h | 74 +
 1 file changed, 74 insertions(+)

diff --git a/gcc/config/arm/arm-mve-builtins-functions.h 
b/gcc/config/arm/arm-mve-builtins-functions.h
index a6573844319..c0fc450f886 100644
--- a/gcc/config/arm/arm-mve-builtins-functions.h
+++ b/gcc/config/arm/arm-mve-builtins-functions.h
@@ -838,6 +838,80 @@ public:
   }
 };
 
+
+/* Map the vmull-related function directly to CODE (UNSPEC, UNSPEC, M)
+   where M is the vector mode associated with type suffix 0.  We need
+   this special case because the builtins have _int in their
+   names.  */
+class unspec_mve_function_exact_insn_vmull : public function_base
+{
+public:
+  CONSTEXPR unspec_mve_function_exact_insn_vmull (int unspec_for_sint,
+ int unspec_for_uint,
+ int unspec_for_m_sint,
+ int unspec_for_m_uint)
+: m_unspec_for_sint (unspec_for_sint),
+  m_unspec_for_uint (unspec_for_uint),
+  m_unspec_for_m_sint (unspec_for_m_sint),
+  m_unspec_for_m_uint (unspec_for_m_uint)
+  {}
+
+  /* The unspec code associated with signed-integer and
+ unsigned-integer operations respectively.  It covers the cases
+ with and without the _m predicate.  */
+  int m_unspec_for_sint;
+  int m_unspec_for_uint;
+  int m_unspec_for_m_sint;
+  int m_unspec_for_m_uint;
+
+  rtx
+  expand (function_expander &e) const override
+  {
+insn_code code;
+
+if (! e.type_suffix (0).integer_p)
+  gcc_unreachable ();
+
+if (e.mode_suffix_id != MODE_none)
+  gcc_unreachable ();
+
+switch (e.pred)
+  {
+  case PRED_none:
+   /* No predicate, no suffix.  */
+   if (e.type_suffix (0).unsigned_p)
+ code = code_for_mve_q_int (m_unspec_for_uint, m_unspec_for_uint, 
e.vector_mode (0));
+   else
+ code = code_for_mve_q_int (m_unspec_for_sint, m_unspec_for_sint, 
e.vector_mode (0));
+
+   return e.use_exact_insn (code);
+
+  case PRED_m:
+   /* No suffix, "m" predicate.  */
+   if (e.type_suffix (0).unsigned_p)
+ code = code_for_mve_q_int_m (m_unspec_for_m_uint, 
m_unspec_for_m_uint, e.vector_mode (0));
+   else
+ code = code_for_mve_q_int_m (m_unspec_for_m_sint, 
m_unspec_for_m_sint, e.vector_mode (0));
+
+   return e.use_cond_insn (code, 0);
+
+  case PRED_x:
+   /* No suffix, "x" predicate.  */
+   if (e.type_suffix (0).unsigned_p)
+ code = code_for_mve_q_int_m (m_unspec_for_m_uint, 
m_unspec_for_m_uint, e.vector_mode (0));
+   else
+ code = code_for_mve_q_int_m (m_unspec_for_m_sint, 
m_unspec_for_m_sint, e.vector_mode (0));
+
+   return e.use_pred_x_insn (code);
+
+  default:
+   gcc_unreachable ();
+  }
+
+gcc_unreachable ();
+  }
+};
+
 } /* end namespace arm_mve */
 
 /* Declare the global function base NAME, creating it from an instance
-- 
2.34.1

[PATCH 5/9] arm: [MVE intrinsics] add support for p8 and p16 polynomial types

Although they look like aliases for u8 and u16, we need to define them
so that we can handle p8 and p16 suffixes with the general framework.

They will be used by vmull[bt]q_poly intrinsics.

2023-08-14  Christophe Lyon  

gcc/
* config/arm/arm-mve-builtins.cc (type_suffixes): Handle poly_p
field.
(TYPES_poly_8_16): New.
(poly_8_16): New.
* config/arm/arm-mve-builtins.def (p8): New type suffix.
(p16): Likewise.
* config/arm/arm-mve-builtins.h (enum type_class_index): Add
TYPE_poly.
(struct type_suffix_info): Add poly_p field.
---
 gcc/config/arm/arm-mve-builtins.cc  | 6 ++
 gcc/config/arm/arm-mve-builtins.def | 2 ++
 gcc/config/arm/arm-mve-builtins.h   | 5 -
 3 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/gcc/config/arm/arm-mve-builtins.cc 
b/gcc/config/arm/arm-mve-builtins.cc
index 7eec9d2861c..fa8b0ad36b3 100644
--- a/gcc/config/arm/arm-mve-builtins.cc
+++ b/gcc/config/arm/arm-mve-builtins.cc
@@ -128,6 +128,7 @@ CONSTEXPR const type_suffix_info 
type_suffixes[NUM_TYPE_SUFFIXES + 1] = {
 TYPE_##CLASS == TYPE_signed || TYPE_##CLASS == TYPE_unsigned, \
 TYPE_##CLASS == TYPE_unsigned, \
 TYPE_##CLASS == TYPE_float, \
+TYPE_##CLASS == TYPE_poly, \
 0, \
 MODE },
 #include "arm-mve-builtins.def"
@@ -177,6 +178,10 @@ CONSTEXPR const type_suffix_info 
type_suffixes[NUM_TYPE_SUFFIXES + 1] = {
 #define TYPES_all_signed(S, D) \
   S (s8), S (s16), S (s32)
 
+/* _p8 _p16.  */
+#define TYPES_poly_8_16(S, D) \
+  S (p8), S (p16)
+
 /* _u8 _u16 _u32.  */
 #define TYPES_all_unsigned(S, D) \
   S (u8), S (u16), S (u32)
@@ -275,6 +280,7 @@ DEF_MVE_TYPES_ARRAY (integer_8);
 DEF_MVE_TYPES_ARRAY (integer_8_16);
 DEF_MVE_TYPES_ARRAY (integer_16_32);
 DEF_MVE_TYPES_ARRAY (integer_32);
+DEF_MVE_TYPES_ARRAY (poly_8_16);
 DEF_MVE_TYPES_ARRAY (signed_16_32);
 DEF_MVE_TYPES_ARRAY (signed_32);
 DEF_MVE_TYPES_ARRAY (reinterpret_integer);
diff --git a/gcc/config/arm/arm-mve-builtins.def 
b/gcc/config/arm/arm-mve-builtins.def
index e3f37876210..e2cf1baf370 100644
--- a/gcc/config/arm/arm-mve-builtins.def
+++ b/gcc/config/arm/arm-mve-builtins.def
@@ -63,6 +63,8 @@ DEF_MVE_TYPE_SUFFIX (u8, uint8x16_t, unsigned, 8, V16QImode)
 DEF_MVE_TYPE_SUFFIX (u16, uint16x8_t, unsigned, 16, V8HImode)
 DEF_MVE_TYPE_SUFFIX (u32, uint32x4_t, unsigned, 32, V4SImode)
 DEF_MVE_TYPE_SUFFIX (u64, uint64x2_t, unsigned, 64, V2DImode)
+DEF_MVE_TYPE_SUFFIX (p8, uint8x16_t, poly, 8, V16QImode)
+DEF_MVE_TYPE_SUFFIX (p16, uint16x8_t, poly, 16, V8HImode)
 #undef REQUIRES_FLOAT
 
 #define REQUIRES_FLOAT true
diff --git a/gcc/config/arm/arm-mve-builtins.h 
b/gcc/config/arm/arm-mve-builtins.h
index c9b51a0c77b..37b8223dfb2 100644
--- a/gcc/config/arm/arm-mve-builtins.h
+++ b/gcc/config/arm/arm-mve-builtins.h
@@ -146,6 +146,7 @@ enum type_class_index
   TYPE_float,
   TYPE_signed,
   TYPE_unsigned,
+  TYPE_poly,
   NUM_TYPE_CLASSES
 };
 
@@ -221,7 +222,9 @@ struct type_suffix_info
   unsigned int unsigned_p : 1;
   /* True if the suffix is for a floating-point type.  */
   unsigned int float_p : 1;
-  unsigned int spare : 13;
+  /* True if the suffix is for a polynomial type.  */
+  unsigned int poly_p : 1;
+  unsigned int spare : 12;
 
   /* The associated vector or predicate mode.  */
   machine_mode vector_mode : 16;
-- 
2.34.1

[PATCH 6/9] arm: [MVE intrinsics] add support for U and p formats in parse_element_type

Introduce these two format specifiers to define the shape of
vmull[bt]q_poly intrinsics.

'U' is used to define a double-width unsigned
'p' is used to define an element of 'poly' type.

2023-08-14  Christophe Lyon  

gcc/
* config/arm/arm-mve-builtins-shapes.cc (parse_element_type): Add
support for 'U' and 'p' format specifiers.
---
 gcc/config/arm/arm-mve-builtins-shapes.cc | 16 
 1 file changed, 16 insertions(+)

diff --git a/gcc/config/arm/arm-mve-builtins-shapes.cc 
b/gcc/config/arm/arm-mve-builtins-shapes.cc
index c8eb3351ef2..761da4d8ece 100644
--- a/gcc/config/arm/arm-mve-builtins-shapes.cc
+++ b/gcc/config/arm/arm-mve-builtins-shapes.cc
@@ -61,10 +61,12 @@ apply_predication (const function_instance &instance, tree 
return_type,
 
[01]- the element type in type suffix 0 or 1 of INSTANCE.
h  - a half-sized version of 
+   p  - a poly type with the same width as 
s - a signed type with the given number of bits
s[01]   - a signed type with the same width as type suffix 0 or 1
u - an unsigned type with the given number of bits
u[01]   - an unsigned type with the same width as type suffix 0 or 1
+   U  - an unsigned type with the double width as 
w  - a double-sized version of 
x - a type with the given number of bits and same signedness
  as the next argument.
@@ -102,6 +104,20 @@ parse_element_type (const function_instance &instance, 
const char *&format)
   type_suffixes[suffix].element_bits * 2);
 }
 
+   if (ch == 'U')
+{
+  type_suffix_index suffix = parse_element_type (instance, format);
+  return find_type_suffix (TYPE_unsigned,
+  type_suffixes[suffix].element_bits * 2);
+}
+
+   if (ch == 'p')
+{
+  type_suffix_index suffix = parse_element_type (instance, format);
+  return find_type_suffix (TYPE_poly,
+  type_suffixes[suffix].element_bits);
+}
+
   if (ch == 'x')
 {
   const char *next = format;
-- 
2.34.1

[PATCH 3/9] arm: [MVE intrinsics] add binary_widen shape

This patch adds the binary_widen shape description.

2023-08-14  Christophe Lyon  

gcc/:

* config/arm/arm-mve-builtins-shapes.cc (binary_widen): New.
* config/arm/arm-mve-builtins-shapes.h (binary_widen): New.
---
 gcc/config/arm/arm-mve-builtins-shapes.cc | 42 +++
 gcc/config/arm/arm-mve-builtins-shapes.h  |  5 +--
 2 files changed, 45 insertions(+), 2 deletions(-)

diff --git a/gcc/config/arm/arm-mve-builtins-shapes.cc 
b/gcc/config/arm/arm-mve-builtins-shapes.cc
index 1f22201ac95..c8eb3351ef2 100644
--- a/gcc/config/arm/arm-mve-builtins-shapes.cc
+++ b/gcc/config/arm/arm-mve-builtins-shapes.cc
@@ -1129,6 +1129,48 @@ struct binary_rshift_narrow_unsigned_def : public 
overloaded_base<0>
 };
 SHAPE (binary_rshift_narrow_unsigned)
 
+/* _t vfoo[_t0](_t, _t)
+
+   Example: vmullbq.
+   int32x4_t [__arm_]vmullbq_int[_s16](int16x8_t a, int16x8_t b)
+   int32x4_t [__arm_]vmullbq_int_m[_s16](int32x4_t inactive, int16x8_t a, 
int16x8_t b, mve_pred16_t p)
+   int32x4_t [__arm_]vmullbq_int_x[_s16](int16x8_t a, int16x8_t b, 
mve_pred16_t p)  */
+struct binary_widen_def : public overloaded_base<0>
+{
+  void
+  build (function_builder &b, const function_group_info &group,
+bool preserve_user_namespace) const override
+  {
+b.add_overloaded_functions (group, MODE_none, preserve_user_namespace);
+build_all (b, "vw0,v0,v0", group, MODE_none, preserve_user_namespace);
+  }
+
+  tree
+  resolve (function_resolver &r) const override
+  {
+unsigned int i, nargs;
+type_suffix_index type;
+if (!r.check_gp_argument (2, i, nargs)
+   || (type = r.infer_vector_type (i - 1)) == NUM_TYPE_SUFFIXES)
+  return error_mark_node;
+
+type_suffix_index wide_suffix
+  = find_type_suffix (type_suffixes[type].tclass,
+ type_suffixes[type].element_bits * 2);
+
+if (!r.require_matching_vector_type (i, type))
+  return error_mark_node;
+
+/* Check the inactive argument has the wide type.  */
+if ((r.pred == PRED_m)
+   && (r.infer_vector_type (0) != wide_suffix))
+  return r.report_no_such_form (type);
+
+return r.resolve_to (r.mode_suffix_id, type);
+  }
+};
+SHAPE (binary_widen)
+
 /* _t vfoo[_n_t0](_t, const int)
 
Check that 'imm' is in the [1..#bits] range.
diff --git a/gcc/config/arm/arm-mve-builtins-shapes.h 
b/gcc/config/arm/arm-mve-builtins-shapes.h
index a1842f5845c..fa6ec4fc002 100644
--- a/gcc/config/arm/arm-mve-builtins-shapes.h
+++ b/gcc/config/arm/arm-mve-builtins-shapes.h
@@ -35,13 +35,13 @@ namespace arm_mve
   {
 
 extern const function_shape *const binary;
-extern const function_shape *const binary_lshift;
-extern const function_shape *const binary_lshift_r;
 extern const function_shape *const binary_acc_int32;
 extern const function_shape *const binary_acc_int64;
 extern const function_shape *const binary_acca_int32;
 extern const function_shape *const binary_acca_int64;
 extern const function_shape *const binary_imm32;
+extern const function_shape *const binary_lshift;
+extern const function_shape *const binary_lshift_r;
 extern const function_shape *const binary_lshift_unsigned;
 extern const function_shape *const binary_maxamina;
 extern const function_shape *const binary_maxavminav;
@@ -54,6 +54,7 @@ namespace arm_mve
 extern const function_shape *const binary_rshift;
 extern const function_shape *const binary_rshift_narrow;
 extern const function_shape *const binary_rshift_narrow_unsigned;
+extern const function_shape *const binary_widen;
 extern const function_shape *const binary_widen_n;
 extern const function_shape *const binary_widen_opt_n;
 extern const function_shape *const cmp;
-- 
2.34.1

[PATCH 7/9] arm: [MVE intrinsics] add binary_widen_poly shape

This patch adds the binary_widen_poly shape description.

2023-08-14  Christophe Lyon  

gcc/
* config/arm/arm-mve-builtins-shapes.cc (binary_widen_poly): New.
* config/arm/arm-mve-builtins-shapes.h (binary_widen_poly): New.
---
 gcc/config/arm/arm-mve-builtins-shapes.cc | 49 +++
 gcc/config/arm/arm-mve-builtins-shapes.h  |  1 +
 2 files changed, 50 insertions(+)

diff --git a/gcc/config/arm/arm-mve-builtins-shapes.cc 
b/gcc/config/arm/arm-mve-builtins-shapes.cc
index 761da4d8ece..23eb9d0e69b 100644
--- a/gcc/config/arm/arm-mve-builtins-shapes.cc
+++ b/gcc/config/arm/arm-mve-builtins-shapes.cc
@@ -1187,6 +1187,55 @@ struct binary_widen_def : public overloaded_base<0>
 };
 SHAPE (binary_widen)
 
+/* _t vfoo[_t0](_t, _t)
+
+   Example: vmullbq_poly.
+   uint32x4_t [__arm_]vmullbq_poly[_p16](uint16x8_t a, uint16x8_t b)
+   uint32x4_t [__arm_]vmullbq_poly_m[_p16](uint32x4_t inactive, uint16x8_t a, 
uint16x8_t b, mve_pred16_t p)
+   uint32x4_t [__arm_]vmullbq_poly_x[_p16](uint16x8_t a, uint16x8_t b, 
mve_pred16_t p)  */
+struct binary_widen_poly_def : public overloaded_base<0>
+{
+  void
+  build (function_builder &b, const function_group_info &group,
+bool preserve_user_namespace) const override
+  {
+b.add_overloaded_functions (group, MODE_none, preserve_user_namespace);
+build_all (b, "vU0,vp0,vp0", group, MODE_none, preserve_user_namespace);
+  }
+
+  tree
+  resolve (function_resolver &r) const override
+  {
+unsigned int i, nargs;
+type_suffix_index type;
+if (!r.check_gp_argument (2, i, nargs)
+   || (type = r.infer_vector_type (i - 1)) == NUM_TYPE_SUFFIXES)
+  return error_mark_node;
+
+/* infer_vector_type found the 'unsigned' version of the 'poly'
+   type we are looking for, so find the 'poly' type with the same
+   width.  */
+type = find_type_suffix (TYPE_poly, type_suffixes[type].element_bits);
+
+type_suffix_index wide_suffix
+  = find_type_suffix (TYPE_unsigned,
+ type_suffixes[type].element_bits * 2);
+
+/* Require the 'poly' type, require_matching_vector_type would try
+   and fail with the 'unsigned' one.  */
+if (!r.require_vector_type (i, type_suffixes[type].vector_type))
+  return error_mark_node;
+
+/* Check the inactive argument has the wide type.  */
+if ((r.pred == PRED_m)
+   && (r.infer_vector_type (0) != wide_suffix))
+  return r.report_no_such_form (type);
+
+return r.resolve_to (r.mode_suffix_id, type);
+  }
+};
+SHAPE (binary_widen_poly)
+
 /* _t vfoo[_n_t0](_t, const int)
 
Check that 'imm' is in the [1..#bits] range.
diff --git a/gcc/config/arm/arm-mve-builtins-shapes.h 
b/gcc/config/arm/arm-mve-builtins-shapes.h
index fa6ec4fc002..a93245321c9 100644
--- a/gcc/config/arm/arm-mve-builtins-shapes.h
+++ b/gcc/config/arm/arm-mve-builtins-shapes.h
@@ -57,6 +57,7 @@ namespace arm_mve
 extern const function_shape *const binary_widen;
 extern const function_shape *const binary_widen_n;
 extern const function_shape *const binary_widen_opt_n;
+extern const function_shape *const binary_widen_poly;
 extern const function_shape *const cmp;
 extern const function_shape *const create;
 extern const function_shape *const inherent;
-- 
2.34.1

[PATCH 8/9] arm: [MVE intrinsics] add unspec_mve_function_exact_insn_vmull_poly

Introduce a function that will be used to build vmull[bt]q_poly
intrinsics that use poly types.

2023-08-14  Christophe Lyon  

gcc/
* config/arm/arm-mve-builtins-functions.h (class
unspec_mve_function_exact_insn_vmull_poly): New.
---
 gcc/config/arm/arm-mve-builtins-functions.h | 56 -
 1 file changed, 55 insertions(+), 1 deletion(-)

diff --git a/gcc/config/arm/arm-mve-builtins-functions.h 
b/gcc/config/arm/arm-mve-builtins-functions.h
index c0fc450f886..eba1f071af0 100644
--- a/gcc/config/arm/arm-mve-builtins-functions.h
+++ b/gcc/config/arm/arm-mve-builtins-functions.h
@@ -838,7 +838,6 @@ public:
   }
 };
 
-
 /* Map the vmull-related function directly to CODE (UNSPEC, UNSPEC, M)
where M is the vector mode associated with type suffix 0.  We need
this special case because the builtins have _int in their
@@ -912,6 +911,61 @@ public:
   }
 };
 
+/* Map the vmull_poly-related function directly to CODE (UNSPEC,
+   UNSPEC, M) where M is the vector mode associated with type suffix
+   0.  We need this special case because the builtins have _poly in
+   their names, and use the special poly type..  */
+class unspec_mve_function_exact_insn_vmull_poly : public function_base
+{
+public:
+  CONSTEXPR unspec_mve_function_exact_insn_vmull_poly (int unspec_for_poly,
+  int unspec_for_m_poly)
+: m_unspec_for_poly (unspec_for_poly),
+  m_unspec_for_m_poly (unspec_for_m_poly)
+  {}
+
+  /* The unspec code associated with signed-integer, unsigned-integer
+ and poly operations respectively.  It covers the cases with and
+ without the _m predicate.  */
+  int m_unspec_for_poly;
+  int m_unspec_for_m_poly;
+
+  rtx
+  expand (function_expander &e) const override
+  {
+insn_code code;
+
+if (e.mode_suffix_id != MODE_none)
+  gcc_unreachable ();
+
+if (! e.type_suffix (0).poly_p)
+  gcc_unreachable ();
+
+switch (e.pred)
+  {
+  case PRED_none:
+   /* No predicate, no suffix.  */
+   code = code_for_mve_q_poly (m_unspec_for_poly, m_unspec_for_poly, 
e.vector_mode (0));
+   return e.use_exact_insn (code);
+
+  case PRED_m:
+   /* No suffix, "m" predicate.  */
+   code = code_for_mve_q_poly_m (m_unspec_for_m_poly, m_unspec_for_m_poly, 
e.vector_mode (0));
+   return e.use_cond_insn (code, 0);
+
+  case PRED_x:
+   /* No suffix, "x" predicate.  */
+   code = code_for_mve_q_poly_m (m_unspec_for_m_poly, m_unspec_for_m_poly, 
e.vector_mode (0));
+   return e.use_pred_x_insn (code);
+
+  default:
+   gcc_unreachable ();
+  }
+
+gcc_unreachable ();
+  }
+};
+
 } /* end namespace arm_mve */
 
 /* Declare the global function base NAME, creating it from an instance
-- 
2.34.1

[PATCH 4/9] arm: [MVE intrinsics] rework vmullbq_int vmulltq_int

Implement vmullbq_int, vmulltq_int using the new MVE builtins
framework.

2023-08-14  Christophe Lyon  

gcc/
* config/arm/arm-mve-builtins-base.cc (vmullbq_int, vmulltq_int):
New.
* config/arm/arm-mve-builtins-base.def (vmullbq_int, vmulltq_int):
New.
* config/arm/arm-mve-builtins-base.h (vmullbq_int, vmulltq_int):
New.
* config/arm/arm_mve.h (vmulltq_int): Remove.
(vmullbq_int): Remove.
(vmullbq_int_m): Remove.
(vmulltq_int_m): Remove.
(vmullbq_int_x): Remove.
(vmulltq_int_x): Remove.
(vmulltq_int_u8): Remove.
(vmullbq_int_u8): Remove.
(vmulltq_int_s8): Remove.
(vmullbq_int_s8): Remove.
(vmulltq_int_u16): Remove.
(vmullbq_int_u16): Remove.
(vmulltq_int_s16): Remove.
(vmullbq_int_s16): Remove.
(vmulltq_int_u32): Remove.
(vmullbq_int_u32): Remove.
(vmulltq_int_s32): Remove.
(vmullbq_int_s32): Remove.
(vmullbq_int_m_s8): Remove.
(vmullbq_int_m_s32): Remove.
(vmullbq_int_m_s16): Remove.
(vmullbq_int_m_u8): Remove.
(vmullbq_int_m_u32): Remove.
(vmullbq_int_m_u16): Remove.
(vmulltq_int_m_s8): Remove.
(vmulltq_int_m_s32): Remove.
(vmulltq_int_m_s16): Remove.
(vmulltq_int_m_u8): Remove.
(vmulltq_int_m_u32): Remove.
(vmulltq_int_m_u16): Remove.
(vmullbq_int_x_s8): Remove.
(vmullbq_int_x_s16): Remove.
(vmullbq_int_x_s32): Remove.
(vmullbq_int_x_u8): Remove.
(vmullbq_int_x_u16): Remove.
(vmullbq_int_x_u32): Remove.
(vmulltq_int_x_s8): Remove.
(vmulltq_int_x_s16): Remove.
(vmulltq_int_x_s32): Remove.
(vmulltq_int_x_u8): Remove.
(vmulltq_int_x_u16): Remove.
(vmulltq_int_x_u32): Remove.
(__arm_vmulltq_int_u8): Remove.
(__arm_vmullbq_int_u8): Remove.
(__arm_vmulltq_int_s8): Remove.
(__arm_vmullbq_int_s8): Remove.
(__arm_vmulltq_int_u16): Remove.
(__arm_vmullbq_int_u16): Remove.
(__arm_vmulltq_int_s16): Remove.
(__arm_vmullbq_int_s16): Remove.
(__arm_vmulltq_int_u32): Remove.
(__arm_vmullbq_int_u32): Remove.
(__arm_vmulltq_int_s32): Remove.
(__arm_vmullbq_int_s32): Remove.
(__arm_vmullbq_int_m_s8): Remove.
(__arm_vmullbq_int_m_s32): Remove.
(__arm_vmullbq_int_m_s16): Remove.
(__arm_vmullbq_int_m_u8): Remove.
(__arm_vmullbq_int_m_u32): Remove.
(__arm_vmullbq_int_m_u16): Remove.
(__arm_vmulltq_int_m_s8): Remove.
(__arm_vmulltq_int_m_s32): Remove.
(__arm_vmulltq_int_m_s16): Remove.
(__arm_vmulltq_int_m_u8): Remove.
(__arm_vmulltq_int_m_u32): Remove.
(__arm_vmulltq_int_m_u16): Remove.
(__arm_vmullbq_int_x_s8): Remove.
(__arm_vmullbq_int_x_s16): Remove.
(__arm_vmullbq_int_x_s32): Remove.
(__arm_vmullbq_int_x_u8): Remove.
(__arm_vmullbq_int_x_u16): Remove.
(__arm_vmullbq_int_x_u32): Remove.
(__arm_vmulltq_int_x_s8): Remove.
(__arm_vmulltq_int_x_s16): Remove.
(__arm_vmulltq_int_x_s32): Remove.
(__arm_vmulltq_int_x_u8): Remove.
(__arm_vmulltq_int_x_u16): Remove.
(__arm_vmulltq_int_x_u32): Remove.
(__arm_vmulltq_int): Remove.
(__arm_vmullbq_int): Remove.
(__arm_vmullbq_int_m): Remove.
(__arm_vmulltq_int_m): Remove.
(__arm_vmullbq_int_x): Remove.
(__arm_vmulltq_int_x): Remove.
---
 gcc/config/arm/arm-mve-builtins-base.cc  |   2 +
 gcc/config/arm/arm-mve-builtins-base.def |   2 +
 gcc/config/arm/arm-mve-builtins-base.h   |   2 +
 gcc/config/arm/arm_mve.h | 648 ---
 4 files changed, 6 insertions(+), 648 deletions(-)

diff --git a/gcc/config/arm/arm-mve-builtins-base.cc 
b/gcc/config/arm/arm-mve-builtins-base.cc
index e31095ae112..3620c56865d 100644
--- a/gcc/config/arm/arm-mve-builtins-base.cc
+++ b/gcc/config/arm/arm-mve-builtins-base.cc
@@ -329,6 +329,8 @@ FUNCTION_WITHOUT_N_NO_F (vmovltq, VMOVLTQ)
 FUNCTION_WITHOUT_N_NO_F (vmovnbq, VMOVNBQ)
 FUNCTION_WITHOUT_N_NO_F (vmovntq, VMOVNTQ)
 FUNCTION_WITHOUT_N_NO_F (vmulhq, VMULHQ)
+FUNCTION (vmullbq_int, unspec_mve_function_exact_insn_vmull, (VMULLBQ_INT_S, 
VMULLBQ_INT_U, VMULLBQ_INT_M_S, VMULLBQ_INT_M_U))
+FUNCTION (vmulltq_int, unspec_mve_function_exact_insn_vmull, (VMULLTQ_INT_S, 
VMULLTQ_INT_U, VMULLTQ_INT_M_S, VMULLTQ_INT_M_U))
 FUNCTION_WITH_RTX_M_N (vmulq, MULT, VMULQ)
 FUNCTION_WITH_RTX_M_N_NO_F (vmvnq, NOT, VMVNQ)
 FUNCTION (vnegq, unspec_based_mve_function_exact_insn, (NEG, NEG, NEG, -1, -1, 
-1, VNEGQ_M_S, -1, VNEGQ_M_F, -1, -1, -1))
diff --git a/gcc/config/arm/arm-mve-builtins-base.def 
b/gcc/config/arm/arm-mve-builtins-base.def
index e7d466f2efd..db811bec479 100644
--- a/gcc/config/arm/arm-mve-builtins-base.def
+++ b/gcc/

[PATCH 9/9] arm: [MVE intrinsics] rework vmullbq_poly vmulltq_poly