date:20241117

[PATCH] Write binary annotations for CodeView S_INLINESITE symbols

2024-11-17 Thread Mark Harmstone

Add "binary annotations" at the end of CodeView S_INLINESITE symbols,
which are a series of compressed integers that represent how line
numbers map to addresses.

This requires assembler support; you will need commit b3aa594d ("gas:
add .cv_ucomp and .cv_scomp pseudo-directives") in binutils.

gcc/
* dwarf2codeview.cc (enum binary_annotation_opcode): Define.
(struct codeview_function): Add htab_next and inline_loc.
(struct cv_func_hasher): Define.
(cv_func_htab): New global variable.
(new_codeview_function): Add new codeview_function to hash table.
(codeview_begin_block): Record location of inline block.
(codeview_end_block): Add dummy source line at end of inline block.
(find_line_function): New function.
(write_binary_annotations): New function.
(write_s_inlinesite): Call write_binary_annotations.
(codeview_debug_finish): Delete cv_func_htab.
---
(This goes against my patch "Write S_INLINESITE CodeView symbols". I
held off from submitting because I was waiting for the binutils patch to
go in, but I want to get this out there before the code freeze.)

 gcc/dwarf2codeview.cc | 195 +-
 1 file changed, 191 insertions(+), 4 deletions(-)

diff --git a/gcc/dwarf2codeview.cc b/gcc/dwarf2codeview.cc
index 4dd1bff6bc6..08fbe7f5bb6 100644
--- a/gcc/dwarf2codeview.cc
+++ b/gcc/dwarf2codeview.cc
@@ -1085,6 +1085,25 @@ enum cv_amd64_register {
   CV_AMD64_YMM15D3 = 687
 };
 
+/* This is enum BinaryAnnotationOpcode in Microsoft's cvinfo.h.  */
+
+enum binary_annotation_opcode {
+  ba_op_invalid,
+  ba_op_code_offset,
+  ba_op_change_code_offset_base,
+  ba_op_change_code_offset,
+  ba_op_change_code_length,
+  ba_op_change_file,
+  ba_op_change_line_offset,
+  ba_op_change_line_end_delta,
+  ba_op_change_range_kind,
+  ba_op_change_column_start,
+  ba_op_change_column_end_delta,
+  ba_op_change_code_offset_and_line_offset,
+  ba_op_change_code_length_and_code_offset,
+  ba_op_change_column_end
+};
+
 struct codeview_string
 {
   codeview_string *next;
@@ -1156,11 +1175,13 @@ struct codeview_inlinee_lines
 struct codeview_function
 {
   codeview_function *next;
+  codeview_function *htab_next;
   function *func;
   unsigned int end_label;
   codeview_line_block *blocks, *last_block;
   codeview_function *parent;
   unsigned int inline_block;
+  location_t inline_loc;
 };
 
 struct codeview_symbol
@@ -1439,6 +1460,16 @@ struct inlinee_lines_hasher : free_ptr_hash 
   }
 };
 
+struct cv_func_hasher : nofree_ptr_hash 
+{
+  typedef dw_die_ref compare_type;
+
+  static bool equal (const codeview_function *f, dw_die_ref die)
+  {
+return lookup_decl_die (f->func->decl) == die;
+  }
+};
+
 static unsigned int line_label_num;
 static unsigned int func_label_num;
 static unsigned int sym_label_num;
@@ -1456,6 +1487,7 @@ static codeview_custom_type *custom_types, 
*last_custom_type;
 static codeview_deferred_type *deferred_types, *last_deferred_type;
 static hash_table *string_id_htab;
 static hash_table *inlinee_lines_htab;
+static hash_table *cv_func_htab;
 
 static uint32_t get_type_num (dw_die_ref type, bool in_struct, bool 
no_fwd_ref);
 static uint32_t get_type_num_subroutine_type (dw_die_ref type, bool in_struct,
@@ -1504,14 +1536,18 @@ get_file_id (const char *filename)
 static codeview_function *
 new_codeview_function (void)
 {
+  codeview_function **slot;
+  dw_die_ref die;
   codeview_function *f = (codeview_function *)
xmalloc (sizeof (codeview_function));
 
   f->next = NULL;
+  f->htab_next = NULL;
   f->func = cfun;
   f->end_label = 0;
   f->blocks = f->last_block = NULL;
   f->inline_block = 0;
+  f->inline_loc = 0;
 
   if (!funcs)
 funcs = f;
@@ -1520,6 +1556,18 @@ new_codeview_function (void)
 
   last_func = f;
 
+  if (!cv_func_htab)
+cv_func_htab = new hash_table (10);
+
+  die = lookup_decl_die (cfun->decl);
+
+  slot = cv_func_htab->find_slot_with_hash (die, htab_hash_pointer (die),
+   INSERT);
+  if (*slot)
+f->htab_next = *slot;
+
+  *slot = f;
+
   return f;
 }
 
@@ -1598,6 +1646,7 @@ codeview_begin_block (unsigned int line ATTRIBUTE_UNUSED,
 
   f->parent = cur_func;
   f->inline_block = blocknum;
+  f->inline_loc = locus;
 
   cur_func = f;
 }
@@ -1610,7 +1659,13 @@ void
 codeview_end_block (unsigned int line ATTRIBUTE_UNUSED, unsigned int blocknum)
 {
   if (cur_func && cur_func->inline_block == blocknum)
-cur_func = cur_func->parent;
+{
+  /* If inlined function, add dummy source line at the end so we know how
+long the actual last line is.  */
+  codeview_source_line (0, "");
+
+  cur_func = cur_func->parent;
+}
 }
 
 /* Adds string to the string table, returning its offset.  If already present,
@@ -3321,6 +3376,133 @@ write_optimized_static_local_vars (dw_die_ref die)
   while (c != first_child);
 }
 
+/* Given a DW_TAG_inlined_subrou

Re: [PATCH 2/2] diagnostics: suppress "note: " prefix in nested diagnostics [PR116253]

2024-11-17 Thread David Malcolm

On Fri, 2024-11-15 at 20:02 -0500, David Malcolm wrote:
> This patch is a followup to:
>   "c++: use diagnostic nesting [PR116253]"
> 
> This patch tweaks how text output with experimental-nesting=yes
> prints nested diagnostics, by omitting the leading "note: " from
> nested notes.
> 
> This reduces the amount of visual cruft the user has to ignore when
> reading C++ template errors; see the examples in the testsuite.
> 
> This doesn't affect the output for users who have not opted-in
> to nested diagnostic-printing.
> 
> Successfully bootstrapped & regrtested on x86_64-pc-linux-gnu.
> OK for trunk?

A screenshot showing the effect of these patches can be seen here:
  https://gcc.gnu.org/bugzilla/attachment.cgi?id=59611

As before, the patches currently require
 -fdiagnostics-set-output=text:experimental-nesting=yes 
to get the indented output.  I think this is a big UX improvement, so I'm
hoping that perhaps it can be the default for GCC (if we can make this
output good enough; it only affects diagnostics within an
auto_diagnostic_nesting_level instance).

Dave

Re: [PATCH 2/2] diagnostics: suppress "note: " prefix in nested diagnostics [PR116253]

2024-11-17 Thread David Malcolm

On Sun, 2024-11-17 at 09:01 -0500, David Malcolm wrote:
> On Fri, 2024-11-15 at 20:02 -0500, David Malcolm wrote:
> > This patch is a followup to:
> >   "c++: use diagnostic nesting [PR116253]"
> > 
> > This patch tweaks how text output with experimental-nesting=yes
> > prints nested diagnostics, by omitting the leading "note: " from
> > nested notes.
> > 
> > This reduces the amount of visual cruft the user has to ignore when
> > reading C++ template errors; see the examples in the testsuite.
> > 
> > This doesn't affect the output for users who have not opted-in
> > to nested diagnostic-printing.
> > 
> > Successfully bootstrapped & regrtested on x86_64-pc-linux-gnu.
> > OK for trunk?
> 
> A screenshot showing the effect of these patches can be seen here:
>   https://gcc.gnu.org/bugzilla/attachment.cgi?id=59611
> 
> As before, the patches currently require
>  -fdiagnostics-set-output=text:experimental-nesting=yes 
> to get the indented output.  I think this is big UX improvement, so
> I'm
> hoping that perhaps it can be the default for GCC

GCC 15, I meant to say.

>  (if we can make this
> output good enough; it only affects diagnsostics within an
> auto_diagnostic_nesting_level instance).

Mark asm statements as necessary in ipa-fnsummary

2024-11-17 Thread Jan Hubicka

Hi,
I forgot to mark asm statements as necessary in ipa-fnsummary. This should
mask failure of gcc.dg/guality/pr36728-2.c where the patch enabled
cloning which breaks debug info.

gcc/ChangeLog:

* ipa-fnsummary.cc (find_necessary_statements): ASM statements are
necessary.

diff --git a/gcc/ipa-fnsummary.cc b/gcc/ipa-fnsummary.cc
index 87e08dad846..e6bdc006969 100644
--- a/gcc/ipa-fnsummary.cc
+++ b/gcc/ipa-fnsummary.cc
@@ -2804,7 +2804,8 @@ find_necessary_statements (struct cgraph_node *node)
  || (is_ctrl_stmt (stmt)
  && (gimple_code (stmt) != GIMPLE_COND
  || !guards_builtin_unreachable (bb, cache)))
- || gimple_store_p (stmt))
+ || gimple_store_p (stmt)
+ || gimple_code (stmt) == GIMPLE_ASM)
mark_stmt_necessary (stmt, worklist);
}
 }

Re: [PATCH V2 9/11] Update tests to work with architecture flags changes.

2024-11-17 Thread Michael Meissner

On Thu, Nov 14, 2024 at 06:47:58PM -0600, Peter Bergner wrote:
> On 11/8/24 1:55 PM, Michael Meissner wrote:
> > Two tests used -mvsx to raise the processor level to at least power7.  These
> > tests were rewritten to add cpu=power7 support.
> 
> Again, this cleanup patch like the TARGET_ -> TARGET_ patches
> is independent of the main patches in this series (ie, patches 1-3),
> so let's pull this out of the series and just mention they're cleanup
> patches preparing for the actual 3 patch series to come later.

See my latest round of patches.

> 
> >  /* { dg-skip-if "" { powerpc*-*-darwin* } } */
> >  /* { dg-require-effective-target powerpc_fprs } */
> >  /* { dg-options "-O2 -ffast-math -mdejagnu-cpu=power5 -mno-altivec 
> > -mabi=altivec -fno-unroll-loops" } */
> > -/* { dg-final { scan-assembler-times "vaddfp" 1 } } */
> > +/* { dg-final { scan-assembler-times "vaddfp" 2 } } */
> >  /* { dg-final { scan-assembler-times "xvaddsp" 1 } } */
> >  /* { dg-final { scan-assembler-times "fadds" 1 } } */
> >  
> > @@ -18,10 +18,6 @@
> >  #error "__VSX__ should not be defined."
> >  #endif
> >  
> > -#pragma GCC target("altivec,vsx")
> > -#include 
> > -#pragma GCC reset_options
> > -
> >  #pragma GCC push_options
> >  #pragma GCC target("altivec,no-vsx")
> 
> Is this illegal?  We're using -mcpu=power5, which should always flag
> an error if we use it with -maltivec or -mvsx.  Isn't that what's
> happening above (before your patch too) by the pragma adding -maltivec
> to the compile options?  Or does the pragma target throw out all
> all of our dg-options?  If so, aren't we using the default -mcpu=
> values (Power4 for BE and Power8 for LE) which would seem ok on LE,
> but a problem on BE.

I'm not sure what the question is.  Without the patch as I said, if you use
-mvsx or #pragma GCC target("vsx"), it essentially sets the cpu to at least
power7.  The patch is to make it illegal to use -mvsx to raise the cpu level.

> 
> > -#pragma GCC target("vsx")
> > +/* cpu=power7 must be used to enable VSX.  */
> > +#pragma GCC target("cpu=power7,vsx")
> 
> Is there a reason you're adding -mvsx too, since the -mcpu=power7
> should enable VSX implicitly. ...or does the -mno-altivec -mno-vsx
> in the dg-options stick around so we need to override them with
> the explicit -mvsx?

I was just trying to make the test case work.  I just added the cpu=power7 into
the pragma.
> 
> 
> >for (i = 0; i < n; i++)
> > -a[i] = vec_add (b[i], c[i]);
> > +a[i] = b[i] + c[i];
> 
> Much better, thanks!  I dislike it when people use vector intrinsics
> when straight C code is cleaner and easier to read.
> 
> 
> 
> 
> 
> > diff --git a/gcc/testsuite/gcc.target/powerpc/pr115688.c 
> > b/gcc/testsuite/gcc.target/powerpc/pr115688.c
> > index 5222e66ef17..00c7c301436 100644
> > --- a/gcc/testsuite/gcc.target/powerpc/pr115688.c
> > +++ b/gcc/testsuite/gcc.target/powerpc/pr115688.c
> > @@ -7,7 +7,8 @@
> >  
> >  /* Verify there is no ICE under 32 bit env.  */
> >  
> > -__attribute__((target("vsx")))
> > +/* cpu=power7 must be used to enable VSX.  */
> > +__attribute__((target("cpu=power7,vsx")))
> >  int test (void)
> >  {
> >return 0;
> 
> Same question as above.  Why the need for adding -mvsx here?
> 
> Peter
> 
> 
> 

-- 
Michael Meissner, IBM
PO Box 98, Ayer, Massachusetts, USA, 01432
email: meiss...@linux.ibm.com

Re: [PATCH V2 4/11] Change TARGET_POPCNTB to TARGET_POWER5

2024-11-17 Thread Michael Meissner

On Thu, Nov 14, 2024 at 06:26:11PM -0600, Peter Bergner wrote:
> On 11/8/24 1:49 PM, Michael Meissner wrote:
> > As part of the architecture flags patches, this patch changes the use of
> > TARGET_POPCNTB to TARGET_POWER5.  The POPCNTB instruction was added in ISA 
> > 2.02
> > (power5).
> 
> I like what this patch and the other related clean up patches are doing,
> namely changing the TARGET_ macros to TARGET_ which makes
> much more sense.  However, the way you ordered the patch series, this
> cleanup patch depends on the main patches that change us to using
> architecture flags, rather than the isa flags that require explicit
> machine options.
> 
> I'd prefer (and I think Segher will too) that these cleanup patches be
> done *before* your main patches that change us to using architecture
> flags.  That way they're independent of the main patches so if we had
> to revert those patches, then these cleanup patches would not have to
> be reverted too.
> 
> So I'm speaking of patches 4/11, 5/11. 7/11 and 8/11.  I don't see a
> 6/11.  Did you forget to email that?  Was that for changing TARGET_FOO
> to TARGET_POWER6?  If so, then that should be handled like patches
> 4 thru 8.

See the 4 patch sets:

Add more user friendly TARGET_ names for PowerPC
https://gcc.gnu.org/pipermail/gcc-patches/2024-November/669067.html

Add support for -mcpu=future in the PowerPC
https://gcc.gnu.org/pipermail/gcc-patches/2024-November/669099.html

Do not allow -mvsx to boost the cpu to power7
https://gcc.gnu.org/pipermail/gcc-patches/2024-November/669106.html

Separate PowerPC ISA bits from architecture bits set by -mcpu=
https://gcc.gnu.org/pipermail/gcc-patches/2024-November/669108.html

-- 
Michael Meissner, IBM
PO Box 98, Ayer, Massachusetts, USA, 01432
email: meiss...@linux.ibm.com

Re: [PATCH repost, 3/5] PowerPC: Switch to dense math names for all MMA operations

2024-11-17 Thread Michael Meissner

If we eliminate patches #3 (switch to dense math names for all MMA operations)
and patch #4 (add dense math test for new instruction) it will continue to
generate the power10 form of the shared instructions and not the future form
dense math registers.

-- 
Michael Meissner, IBM
PO Box 98, Ayer, Massachusetts, USA, 01432
email: meiss...@linux.ibm.com

[PATCH v2 12/14] Support for 64-bit location_t: Backend parts

2024-11-17 Thread Lewis Hyatt

A few targets have been using "unsigned int" function arguments that need to
receive a "location_t". Change to "location_t" to prepare for the
possibility that location_t can be configured to be a different type.

gcc/ChangeLog:

* config/aarch64/aarch64-c.cc (aarch64_resolve_overloaded_builtin):
Change "unsigned int" argument to "location_t".
* config/avr/avr-c.cc (avr_resolve_overloaded_builtin): Likewise.
* config/riscv/riscv-c.cc (riscv_resolve_overloaded_builtin): Likewise.
* target.def: Likewise.
* doc/tm.texi: Regenerate.
---
 gcc/config/aarch64/aarch64-c.cc | 3 +--
 gcc/config/avr/avr-c.cc | 3 +--
 gcc/config/riscv/riscv-c.cc | 3 +--
 gcc/doc/tm.texi | 2 +-
 gcc/target.def  | 2 +-
 5 files changed, 5 insertions(+), 8 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-c.cc b/gcc/config/aarch64/aarch64-c.cc
index faedb25ddb3..79a680f2e24 100644
--- a/gcc/config/aarch64/aarch64-c.cc
+++ b/gcc/config/aarch64/aarch64-c.cc
@@ -369,11 +369,10 @@ aarch64_pragma_aarch64 (cpp_reader *)
 
 /* Implement TARGET_RESOLVE_OVERLOADED_BUILTIN.  */
 static tree
-aarch64_resolve_overloaded_builtin (unsigned int uncast_location,
+aarch64_resolve_overloaded_builtin (location_t location,
tree fndecl, void *uncast_arglist)
 {
   vec empty = {};
-  location_t location = (location_t) uncast_location;
   vec *arglist = (uncast_arglist
   ? (vec *) uncast_arglist
   : &empty);
diff --git a/gcc/config/avr/avr-c.cc b/gcc/config/avr/avr-c.cc
index d3c40d73043..7cf8344c1c7 100644
--- a/gcc/config/avr/avr-c.cc
+++ b/gcc/config/avr/avr-c.cc
@@ -48,11 +48,10 @@ enum avr_builtin_id
 /* Implement `TARGET_RESOLVE_OVERLOADED_PLUGIN'.  */
 
 static tree
-avr_resolve_overloaded_builtin (unsigned int iloc, tree fndecl, void *vargs)
+avr_resolve_overloaded_builtin (location_t loc, tree fndecl, void *vargs)
 {
   tree type0, type1, fold = NULL_TREE;
   avr_builtin_id id = AVR_BUILTIN_COUNT;
-  location_t loc = (location_t) iloc;
   vec &args = * (vec*) vargs;
 
   switch (DECL_MD_FUNCTION_CODE (fndecl))
diff --git a/gcc/config/riscv/riscv-c.cc b/gcc/config/riscv/riscv-c.cc
index c59f408d3a8..7f78e2cf019 100644
--- a/gcc/config/riscv/riscv-c.cc
+++ b/gcc/config/riscv/riscv-c.cc
@@ -312,11 +312,10 @@ riscv_check_builtin_call (location_t loc, vec 
arg_loc, tree fndecl,
 
 /* Implement TARGET_RESOLVE_OVERLOADED_BUILTIN.  */
 static tree
-riscv_resolve_overloaded_builtin (unsigned int uncast_location, tree fndecl,
+riscv_resolve_overloaded_builtin (location_t loc, tree fndecl,
  void *uncast_arglist)
 {
   vec empty = {};
-  location_t loc = (location_t) uncast_location;
   vec *arglist = (vec *) uncast_arglist;
   unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
   unsigned int subcode = code >> RISCV_BUILTIN_SHIFT;
diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
index 109e40384b6..58a94822156 100644
--- a/gcc/doc/tm.texi
+++ b/gcc/doc/tm.texi
@@ -12115,7 +12115,7 @@ ignored.  This function should return the result of the 
call to the
 built-in function.
 @end deftypefn
 
-@deftypefn {Target Hook} tree TARGET_RESOLVE_OVERLOADED_BUILTIN (unsigned int 
@var{loc}, tree @var{fndecl}, void *@var{arglist})
+@deftypefn {Target Hook} tree TARGET_RESOLVE_OVERLOADED_BUILTIN (location_t 
@var{loc}, tree @var{fndecl}, void *@var{arglist})
 Select a replacement for a machine specific built-in function that
 was set up by @samp{TARGET_INIT_BUILTINS}.  This is done
 @emph{before} regular type checking, and so allows the target to
diff --git a/gcc/target.def b/gcc/target.def
index 523ae7ec9aa..e285cef5743 100644
--- a/gcc/target.def
+++ b/gcc/target.def
@@ -2497,7 +2497,7 @@ arguments passed to the built-in function.  The result is 
a\n\
 complete expression that implements the operation, usually\n\
 another @code{CALL_EXPR}.\n\
 @var{arglist} really has type @samp{VEC(tree,gc)*}",
- tree, (unsigned int /*location_t*/ loc, tree fndecl, void *arglist), NULL)
+ tree, (location_t loc, tree fndecl, void *arglist), NULL)
 
 DEFHOOK
 (check_builtin_call,

[PATCH] PR target/108958: Use mtvsrdd to zero extend GPR DImode to VSX TImode

2024-11-17 Thread Michael Meissner

Previously GCC would zero extend a DImode GPR value to TImode by first zero
extending the DImode value into a GPR TImode value, and then do a MTVSRDD to
move this value to a VSX register.

This patch does the move directly, since if the middle argument to MTVSRDD is 0,
it does the zero extend.

If the DImode value is already in a vector register, it does a XXSPLTIB and
XXPERMDI to get the value into the bottom 64-bits of the register.

I have built GCC with the patches in this patch set applied on both little and
big endian PowerPC systems and there were no regressions.  Can I apply this
patch to GCC 15?

2024-11-17  Michael Meissner  

gcc/

PR target/108958
* gcc/config/rs6000/rs6000.md (zero_extendditi2): New insn.

gcc/testsuite/

PR target/108958
* gcc.target/powerpc/pr108958.c: New test.
---
 gcc/config/rs6000/rs6000.md | 46 +
 gcc/testsuite/gcc.target/powerpc/pr108958.c | 27 
 2 files changed, 73 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/powerpc/pr108958.c

diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md
index d266f93ff2e..bfb02b07ef4 100644
--- a/gcc/config/rs6000/rs6000.md
+++ b/gcc/config/rs6000/rs6000.md
@@ -1026,6 +1026,52 @@ (define_insn_and_split "*zero_extendsi2_dot2"
(set_attr "dot" "yes")
(set_attr "length" "4,8")])
 
+(define_insn_and_split "zero_extendditi2"
+  [(set (match_operand:TI 0 "gpc_reg_operand" "=r,wa,&wa")
+   (zero_extend:TI
+(match_operand:DI 1 "gpc_reg_operand" "rwa,r,wa")))]
+  "TARGET_P9_VECTOR && TARGET_POWERPC64"
+  "@
+  #
+  mtvsrdd %x0,0,%1
+  #"
+  "&& reload_completed
+   && (int_reg_operand (operands[0], TImode)
+   || vsx_register_operand (operands[1], DImode))"
+  [(set (match_dup 2)
+   (match_dup 3))
+   (set (match_dup 4)
+   (match_dup 5))]
+{
+  rtx op0 = operands[0];
+  rtx op1 = operands[1];
+  int r = reg_or_subregno (op0);
+
+  if (int_reg_operand (op0, TImode))
+{
+  int lo = BYTES_BIG_ENDIAN ? 1 : 0;
+  int hi = 1 - lo;
+
+  operands[2] = gen_rtx_REG (DImode, r + lo);
+  operands[3] = op1;
+  operands[4] = gen_rtx_REG (DImode, r + hi);
+  operands[5] = const0_rtx;
+}
+  else
+{
+  rtx op0_di = gen_rtx_REG (DImode, r);
+  rtx op0_v2di = gen_rtx_REG (V2DImode, r);
+  rtx lo = WORDS_BIG_ENDIAN ? op1 : op0_di;
+  rtx hi = WORDS_BIG_ENDIAN ? op0_di : op1;
+
+  operands[2] = op0_v2di;
+  operands[3] = CONST0_RTX (V2DImode);
+  operands[4] = op0_v2di;
+  operands[5] = gen_rtx_VEC_CONCAT (V2DImode, hi, lo);
+}
+}
+  [(set_attr "type" "*,mtvsr,vecperm")
+   (set_attr "length" "8,*,8")])
 
 (define_insn "extendqi2"
   [(set (match_operand:EXTQI 0 "gpc_reg_operand" "=r,?*v")
diff --git a/gcc/testsuite/gcc.target/powerpc/pr108958.c 
b/gcc/testsuite/gcc.target/powerpc/pr108958.c
new file mode 100644
index 000..03eb58d069e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr108958.c
@@ -0,0 +1,27 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target int128 } */
+/* { dg-require-effective-target lp64 } */
+/* { dg-options "-mdejagnu-cpu=power9 -O2" } */
+
+/* PR target/108958, use mtvsrdd to zero extend gpr to vsx register.  */
+
+void
+gpr_to_vsx (unsigned long long x, __uint128_t *p)
+{
+  /* mtvsrdd vsx,0,gpr.  */
+  __uint128_t y = x;
+  __asm__ (" # %x0" : "+wa" (y));
+  *p = y;
+}
+
+void
+gpr_to_gpr (unsigned long long x, __uint128_t *p)
+{
+  /* mr and li.  */
+  __uint128_t y = x;
+  __asm__ (" # %0" : "+r" (y));
+  *p = y;
+}
+
+/* { dg-final { scan-assembler-times {\mli\M}  1 } } */
+/* { dg-final { scan-assembler-times {\mmtvsrdd .*,0,.*\M} 1 } } */
-- 
2.47.0


-- 
Michael Meissner, IBM
PO Box 98, Ayer, Massachusetts, USA, 01432
email: meiss...@linux.ibm.com

Re: [PATCH v2] GCC Driver : Enable very long gcc command-line option

2024-11-17 Thread Andrew Pinski

On Tue, Sep 17, 2024, 3:40 AM Dora, Sunil Kumar <
sunilkumar.d...@windriver.com> wrote:

> Hi Andrew,
>
> Initially, I thought to address long command line options (when exceeding
> 128KB) without disrupting the existing GCC driver behavior.
>
> As you suggested, I implemented changes to use the response file format (
> @file) within the set_collect_gcc_options function and ensured that this
> was passed through COLLECT_GCC_OPTIONS.
> However, these changes have introduced a side effect: they impact the
> behavior of the -save-temps switch by generating additional .args.N files.
> As a result, some existing test cases, including the one reported by the
> Linaro team, are now failing.
> (File: Attached)
> Could you please advise on how we should proceed? Specifically, should we
> adjust the test cases to accommodate the impact on the -save-temps switch,
> or is there an alternative approach you would recommend? Your guidance on
> how to address these issues while implementing the response file approach
> would be greatly appreciated.
>
Sounds like the test case needs to be adjusted for the new files that are
saved now, since we want to have this file around when using -save-temps to
be able to reproduce what is being invoked.

Thanks,
Andrew


Thank you for your support.
>
>
>
> Thanks,
> Sunil Dora
> --
> *From:* Andrew Pinski 
> *Sent:* Friday, September 6, 2024 11:33 PM
> *To:* Dora, Sunil Kumar 
> *Cc:* Hemraj, Deepthi ; GCC Patches <
> gcc-patches@gcc.gnu.org>; Richard Guenther ; Jeff Law <
> jeffreya...@gmail.com>; josmy...@redhat.com ;
> MacLeod, Randy ; Gowda, Naveen <
> naveen.go...@windriver.com>
> *Subject:* Re: [PATCH v2] GCC Driver : Enable very long gcc command-line
> option
>
> * CAUTION: This email comes from a non Wind River email account!*
> Do not click links or open attachments unless you recognize the sender and
> know the content is safe.
>
>
> On Fri, Sep 6, 2024, 9:38 AM Dora, Sunil Kumar <
> sunilkumar.d...@windriver.com> wrote:
>
> Hi Andrew,
>
> Thank you for your feedback. Initially, we attempted to address the issue
> by utilizing GCC’s response files. However, we discovered that the
> COLLECT_GCC_OPTIONS variable already contains the expanded contents of
> the response files.
>
> As a result, using response files only mitigates the multiplication factor
> but does not bypass the 128KB limit.
>
>
> I think you did not fully understand me. What I was saying instead of
> creating a string inside set_collect_gcc_options, create the response file
> and pass that via COLLECT_GCC_OPTIONS with the @file format. And then
> inside collect2.cc
> 
> when using COLLECT_GCC_OPTIONS/extract_string instead read in the response
> file options if there was an @file instead of those 2 loops. This requires
> more than what you did. Oh and should be less memory hungry and maybe
> slightly faster.
>
> Thanks,
> Andrew
>
>
>
> I have included the response file usage logs and the complete history in
> the Bugzilla report for your reference: Bugzilla Link
> 
> .
> Following your suggestion, I have updated the logic to avoid hardcoding
> /tmp.
> Please find the revised version of patch at the following link:
>
> https://gcc.gnu.org/pipermail/gcc-patches/2024-September/662519.html
> 
>
> Thanks,
> Sunil Dora
> --
> *From:* Andrew Pinski 
> *Sent:* Friday, August 30, 2024 8:05 PM
> *To:* Hemraj, Deepthi 
> *Cc:* gcc-patches@gcc.gnu.org ; rguent...@suse.de
> ; jeffreya...@gmail.com ;
> josmy...@redhat.com ; MacLeod, Randy <
> randy.macl...@windriver.com>; Gowda, Naveen ;
> Dora, Sunil Kumar 
> *Subject:* Re: [PATCH v2] GCC Driver : Enable very long gcc command-line
> option
>
> CAUTION: This email comes from a non Wind River email account!
> Do not click links or open attachments unless you recognize the sender and
> know the content is safe.
>
> On Fri, Aug 30, 2024 at 12:34 AM  wrote:
> >
> > From: Deepthi Hemraj 
> >
> > For excessively long environment variables i.e >128KB
> > Store the arguments in a temporary file and collect them back together
> in collect2.
> >
> > This commit patches for COLLECT_GCC_OPTIONS issue:
> > GCC should not limit the length of command line passed to collect2.
> > https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111527
>

[PATCH v2] c: Introduce -Wfree-labels

2024-11-17 Thread Florian Weimer

This is another recent GCC extension whose use is apparently
difficult to spot in code reviews.

The name of the option is due to Jonathan Wakely.  Part of it
could apply to C++ as well (for labels at the end of a compound
statement).

gcc/c-family/

* c-opts.cc (c_common_post_options): Initialize
warn_free_labels.
* c.opt (Wfree-labels): New option.
* c.opt.urls: Regenerate.

gcc/c/

* c-parser.cc (c_parser_compound_statement_nostart): Use
OPT_Wfree_labels for warning about labels on declarations.
(c_parser_compound_statement_nostart): Use OPT_Wfree_labels
for warning about labels at end of compound statements.

gcc/

* doc/invoke.texi: Document -Wfree-labels.

gcc/testsuite/

* gcc.dg/Wfree-labels-1.c: New test.
* gcc.dg/Wfree-labels-2.c: New test.
* gcc.dg/Wfree-labels-3.c: New test.

---
v2: Rebase on top of current trunk.
 gcc/c-family/c-opts.cc|  5 +
 gcc/c-family/c.opt|  4 
 gcc/c-family/c.opt.urls   |  3 +++
 gcc/c/c-parser.cc |  5 +++--
 gcc/doc/invoke.texi   | 15 +--
 gcc/testsuite/gcc.dg/Wfree-labels-1.c | 18 ++
 gcc/testsuite/gcc.dg/Wfree-labels-2.c | 18 ++
 gcc/testsuite/gcc.dg/Wfree-labels-3.c | 18 ++
 8 files changed, 82 insertions(+), 4 deletions(-)

diff --git a/gcc/c-family/c-opts.cc b/gcc/c-family/c-opts.cc
index 3a3464ccc3f..a7149a85171 100644
--- a/gcc/c-family/c-opts.cc
+++ b/gcc/c-family/c-opts.cc
@@ -1006,6 +1006,11 @@ c_common_post_options (const char **pfilename)
   = ((pedantic && !flag_isoc23 && warn_c11_c23_compat != 0)
 || warn_c11_c23_compat > 0);
 
+  /* Likewise for -Wfree-labels.  */
+  if (warn_free_labels == -1)
+warn_free_labels = ((pedantic && !flag_isoc23 && warn_c11_c23_compat != 0)
+   || warn_c11_c23_compat > 0);
+
   if (warn_deprecated_non_prototype == -1)
 warn_deprecated_non_prototype = warn_c11_c23_compat > 0;
 
diff --git a/gcc/c-family/c.opt b/gcc/c-family/c.opt
index 220421accf4..4ccf3bbe6e6 100644
--- a/gcc/c-family/c.opt
+++ b/gcc/c-family/c.opt
@@ -832,6 +832,10 @@ Wframe-address
 C ObjC C++ ObjC++ Var(warn_frame_address) Warning LangEnabledBy(C ObjC C++ 
ObjC++,Wall)
 Warn when __builtin_frame_address or __builtin_return_address is used unsafely.
 
+Wfree-labels
+C ObjC Var(warn_free_labels) Init(-1) Warning
+Warn about labels on declarations and at the end of compound statements.
+
 Wglobal-module
 C++ ObjC++ Var(warn_global_module) Warning Init(1)
 Warn about the global module fragment not containing only preprocessing 
directives.
diff --git a/gcc/c-family/c.opt.urls b/gcc/c-family/c.opt.urls
index 91918c49204..d9904662b4b 100644
--- a/gcc/c-family/c.opt.urls
+++ b/gcc/c-family/c.opt.urls
@@ -421,6 +421,9 @@ UrlSuffix(gcc/Warning-Options.html#index-Wformat)
 Wframe-address
 UrlSuffix(gcc/Warning-Options.html#index-Wframe-address)
 
+Wfree-labels
+UrlSuffix(gcc/Warning-Options.html#index-Wfree-labels)
+
 Wglobal-module
 UrlSuffix(gcc/C_002b_002b-Dialect-Options.html#index-Wglobal-module)
 
diff --git a/gcc/c/c-parser.cc b/gcc/c/c-parser.cc
index 70fbf940835..413eaae5fe3 100644
--- a/gcc/c/c-parser.cc
+++ b/gcc/c/c-parser.cc
@@ -7403,7 +7403,7 @@ c_parser_compound_statement_nostart (c_parser *parser)
   && (have_std_attrs = true)))
{
  if (last_label)
-   pedwarn_c11 (c_parser_peek_token (parser)->location, OPT_Wpedantic,
+   pedwarn_c11 (c_parser_peek_token (parser)->location, 
OPT_Wfree_labels,
 "a label can only be part of a statement and "
 "a declaration is not a statement");
  /* It's unlikely we'll see a nested loop in a declaration in
@@ -7550,7 +7550,8 @@ c_parser_compound_statement_nostart (c_parser *parser)
   parser->error = false;
 }
   if (last_label)
-pedwarn_c11 (label_loc, OPT_Wpedantic, "label at end of compound 
statement");
+pedwarn_c11 (label_loc, OPT_Wfree_labels,
+"label at end of compound statement");
   location_t endloc = c_parser_peek_token (parser)->location;
   c_parser_consume_token (parser);
 
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index c73025e2d0e..7ff7e694bc9 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -520,8 +520,8 @@ Objective-C and Objective-C++ Dialects}.
 }
 
 @item C and Objective-C-only Warning Options
-@gccoptlist{-Wbad-function-cast -Wdeprecated-non-prototype 
-Wmissing-declarations
--Wmissing-parameter-name -Wmissing-parameter-type
+@gccoptlist{-Wbad-function-cast -Wdeprecated-non-prototype -Wfree-labels
+-Wmissing-declarations -Wmissing-parameter-name -Wmissing-parameter-type
 -Wdeclaration-missing-parameter-type -Wmissing-prototypes
 -Wmissing-variable-declarations -Wnested-externs -Wold-style-declaration
 -Wold-style-definition -Wstrict-prototypes -Wtraditi

Re: [committed] c: Default to -std=gnu23

2024-11-17 Thread Sam James

Florian Weimer  writes:

> * Joseph Myers:
>
>> Change the default language version for C compilation from -std=gnu17
>> to -std=gnu23.  A few tests are updated to remove local definitions of
>> bool, true and false (where making such an unconditional test change
>> seemed to make more sense than changing the test conditionally earlier
>> or building it with -std=gnu17); most test issues were already
>> addressed in previous patches.  In the case of
>> ctf-function-pointers-2.c, it was agreed in bug 117289 that it would
>> be OK to put -std=gnu17 in the test and leave more optimal BTF / CTF
>> output for this test as a potential future improvement.
>>
>> Since the original test fixes, more such fixes have become necessary
>> and so are included in this patch.  More noinline attributes are added
>> to simulate-thread tests where () meaning a prototype affected test
>> results, while gcc.dg/torture/pr117496-1.c (a test declaring a
>> function with () then calling it with arguments) gets -std=gnu17
>> added.
>>
>> Bootstrapped with no regressions for x86_64-pc-linux-gnu.
>
> Has anyone performed experiments to determine the impact of this change
> on typical free software distributions?

I filed https://gcc.gnu.org/PR117298 for an issue Joseph noticed in one
of the GCC tests (that is actually an improvement, but a missed opt for
older standards). I haven't done any sort of testing but am curious
about it as well.

I could do such a test for code size en masse (and perhaps even
check whether the image changed at all). Runtime performance is far harder
for me to do at scale though. We can use significant code size changes
as a proxy for interesting candidates to investigate.

What are you thinking of?

>
> Thanks,
> Florian

thanks,
sam

[PATCH] match: Fix the `max==0` pattern for pointers [PR117646]

2024-11-17 Thread Andrew Pinski

For pointers I forgot that BIT_IOR_EXPR is not valid, so when
I added the pattern to convert `max != 0` (r15-5356), GCC
started to ICE, saying pointer types were not valid for
BIT_IOR_EXPR.
This fixes the problem by casting to the unsigned type of the
inner type. Another way of fixing this would be to handle it
as `a == 0 & b == 0`, but both match and reassociation (for pointers)
would then convert it back into the form I am creating here, so
let's just use that form instead.

Bootstrapped and tested on x86_64-linux-gnu.

PR tree-optimization/117646

gcc/ChangeLog:

* match.pd (`max==0`): Add casts to `unsigned type`.

gcc/testsuite/ChangeLog:

* gcc.dg/torture/minmaxneeqptr-1.c: New test.

Signed-off-by: Andrew Pinski 
---
 gcc/match.pd  |  6 --
 .../gcc.dg/torture/minmaxneeqptr-1.c  | 21 +++
 2 files changed, 25 insertions(+), 2 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/torture/minmaxneeqptr-1.c

diff --git a/gcc/match.pd b/gcc/match.pd
index 4bec24a21b2..f5181325f3b 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -4799,9 +4799,11 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
MAX (A, B) != 0 -> (A|B) != 0 iff unsigned.  */
 (for cmp (eq ne)
  (simplify
-  (cmp (max @0 @1) integer_zerop@2)
+  (cmp (max @0 @1) integer_zerop)
   (if (TYPE_UNSIGNED (TREE_TYPE (@0)))
-   (cmp (bit_ior @0 @1) @2
+   (with { tree utype = unsigned_type_for (TREE_TYPE (@0)); }
+(cmp (bit_ior (convert:utype @0) (convert:utype @1))
+ { build_zero_cst (utype); } )
 
 /* Undo fancy ways of writing max/min or other ?: expressions, like
a - ((a - b) & -(a < b))  and  a - (a - b) * (a < b) into (a < b) ? b : a.
diff --git a/gcc/testsuite/gcc.dg/torture/minmaxneeqptr-1.c 
b/gcc/testsuite/gcc.dg/torture/minmaxneeqptr-1.c
new file mode 100644
index 000..aa45722330f
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/minmaxneeqptr-1.c
@@ -0,0 +1,21 @@
+/* { dg-do compile } */
+
+/* PR tree-optimization/117646 */
+
+int maxeq(char *a, char *b) {
+  char *p = a < b ? b : a;
+  return p == (void*)0;
+}
+int maxne(char *a, char *b) {
+  char *p = a < b ? b : a;
+  return p == (void*)0;
+}
+
+int mineq(char *a, char *b) {
+  char *p = a > b ? b : a;
+  return p == (void*)0;
+}
+int minne(char *a, char *b) {
+  char *p = a > b ? b : a;
+  return p == (void*)0;
+}
-- 
2.43.0

Re: [committed] c: Default to -std=gnu23

2024-11-17 Thread Florian Weimer

* Joseph Myers:

> Change the default language version for C compilation from -std=gnu17
> to -std=gnu23.  A few tests are updated to remove local definitions of
> bool, true and false (where making such an unconditional test change
> seemed to make more sense than changing the test conditionally earlier
> or building it with -std=gnu17); most test issues were already
> addressed in previous patches.  In the case of
> ctf-function-pointers-2.c, it was agreed in bug 117289 that it would
> be OK to put -std=gnu17 in the test and leave more optimal BTF / CTF
> output for this test as a potential future improvement.
>
> Since the original test fixes, more such fixes have become necessary
> and so are included in this patch.  More noinline attributes are added
> to simulate-thread tests where () meaning a prototype affected test
> results, while gcc.dg/torture/pr117496-1.c (a test declaring a
> function with () then calling it with arguments) gets -std=gnu17
> added.
>
> Bootstrapped with no regressions for x86_64-pc-linux-gnu.

Has anyone performed experiments to determine the impact of this change
on typical free software distributions?

Thanks,
Florian

Re: [PATCH] Fortran: add bounds-checking for ALLOCATE of CHARACTER with type-spec [PR53357]

2024-11-17 Thread Jerry D


On 11/17/24 2:21 PM, Harald Anlauf wrote:

Dear all,

the attached patch fixes a rejects-valid / rejects-potentially-valid code issue
for ALLOCATE of CHARACTER with type-spec, and adds character length checking
with -fcheck=bounds for the case at hand.  It also improves checking of
character function declarations and references slightly, using the diagnostics
of NAG as a guidance.

Some testcases popped up during regtesting, suggesting that one needs to be
careful not to generate too many false positives, so I decided to not spend
too much time on the FIXME's therein.  (Character length might be expressions
in an explicit interface and the actual declaration, where we don't have a
reliable way to compare.)

Regtested on x86_64-pc-linux-gnu.  OK for mainline?

Thanks,
Harald



Looks good, OK for mainline.

Jerry

[PATCH 04/15] testsuite: Expand coverage for unaligned memory stores

2024-11-17 Thread Maciej W. Rozycki

Expand coverage for unaligned memory stores, for the "insvmisalignM" 
patterns, for 2-byte, 4-byte, and 8-byte scalars, across byte alignments 
of 1, 2, 4 and byte misalignments within from 0 up to 7 (there's some 
redundancy there for the sake of simplicity of the test case), making 
sure all data is written and no data is changed outside the area meant 
to be written.

The test case has turned invaluable in verifying changes to the Alpha 
backend, but functionality covered is generic, so I have concluded this 
test qualifies for generic verification and does not have to be limited 
to the Alpha-specific subset of the testsuite.

gcc/testsuite/
* gcc.c-torture/execute/misalign.c: New file.
---
 gcc/testsuite/gcc.c-torture/execute/misalign.c |   84 +
 1 file changed, 84 insertions(+)

gcc-test-misaligned.diff
Index: gcc/gcc/testsuite/gcc.c-torture/execute/misalign.c
===
--- /dev/null
+++ gcc/gcc/testsuite/gcc.c-torture/execute/misalign.c
@@ -0,0 +1,84 @@
+typedef unsigned int __attribute__ ((mode (QI))) intw1_t;
+typedef unsigned int __attribute__ ((mode (HI))) intw2_t;
+typedef unsigned int __attribute__ ((mode (SI))) intw4_t;
+typedef unsigned int __attribute__ ((mode (DI))) intw8_t;
+
+#define MISALIGN_DEFINE_ONE(align, width, offset)  \
+  static void  \
+  misalign_check_one_ ## align ## width ## offset (void)   \
+{  \
+  static union \
+   {   \
+ intw1_t v[32];\
+ struct __attribute__ ((packed))   \
+   {   \
+ intw1_t o[8 + offset];\
+ intw ## width ## _t x;\
+   } x;\
+ intw ## align ## _t a;\
+   }   \
+  dst = {{ [0 ... 31] = 0xaa }};   \
+  static const union   \
+   {   \
+ intw1_t v[8]; \
+ intw ## width ## _t x;\
+   }   \
+  src = {{ 1, 2, 3, 4, 5, 6, 7, 8 }};  \
+  int i, j;
\
+   \
+  dst.x.x = src.x; \
+  asm ("" : : : "memory"); \
+  for (i = 0; i < 8 + offset; i++) \
+   if (dst.v[i] != 0xaa)   \
+ __builtin_abort ();   \
+  for (j = 0; i < 8 + offset + width; i++, j++)\
+   if (dst.v[i] != src.v[j])   \
+ __builtin_abort ();   \
+  for (; i < sizeof (dst.v); i++)  \
+   if (dst.v[i] != 0xaa)   \
+ __builtin_abort ();   \
+}
+
+#define MISALIGN_DEFINE_ONE_ALIGN_WIDTH(align, width)  \
+  MISALIGN_DEFINE_ONE (align, width, 1)
\
+  MISALIGN_DEFINE_ONE (align, width, 2)
\
+  MISALIGN_DEFINE_ONE (align, width, 3)
\
+  MISALIGN_DEFINE_ONE (align, width, 4)
\
+  MISALIGN_DEFINE_ONE (align, width, 5)
\
+  MISALIGN_DEFINE_ONE (align, width, 6)
\
+  MISALIGN_DEFINE_ONE (align, width, 7)
+
+MISALIGN_DEFINE_ONE_ALIGN_WIDTH (1, 2)
+MISALIGN_DEFINE_ONE_ALIGN_WIDTH (1, 4)
+MISALIGN_DEFINE_ONE_ALIGN_WIDTH (1, 8)
+MISALIGN_DEFINE_ONE_ALIGN_WIDTH (2, 4)
+MISALIGN_DEFINE_ONE_ALIGN_WIDTH (2, 8)
+MISALIGN_DEFINE_ONE_ALIGN_WIDTH (4, 8)
+
+#define MISALIGN_CHECK_ONE(align, width, offset)   \
+  misalign_check_one_ ## align ## width ## offset ();
+
+#define MISALIGN_CHECK_ONE_ALIGN_WIDTH(align, width)   \
+  do   \
+{

[PATCH] build: Discard obsolete references to $(GCC_PARTS)

2024-11-17 Thread Maciej W. Rozycki

The $(GCC_PARTS) variable was deleted with the Makefile rework in commit 
fa9585134f6f ("libgcc move to the top level")[1] back in 2007, and yet 
the Ada and Modula 2 frontends added references to this variable later 
on, with commit e972fd5281b7 ("[Ada] clean ups in Makefiles")[2] back in 
2011 and commit 1eee94d35177 ("Merge modula-2 front end onto gcc.") back 
in 2022 respectively.

I guess it's because the frontends lived too long externally.  Discard 
the references then, they serve no purpose nowadays.

References:

[1] 


[2] 


gcc/ada/
* gcc-interface/Make-lang.in (gnattools): Remove $(GCC_PARTS).

gcc/m2/
* Make-lang.in (m2 modula-2 modula2): Remove $(GCC_PARTS).
---
 NB references only given to legacy commits that have no proper change 
heading to refer to with git.
---
 gcc/ada/gcc-interface/Make-lang.in |2 +-
 gcc/m2/Make-lang.in|3 +--
 2 files changed, 2 insertions(+), 3 deletions(-)

gcc-gcc-parts.diff
Index: gcc/gcc/ada/gcc-interface/Make-lang.in
===
--- gcc.orig/gcc/ada/gcc-interface/Make-lang.in
+++ gcc/gcc/ada/gcc-interface/Make-lang.in
@@ -793,7 +793,7 @@ gnatbind$(exeext): ada/b_gnatb.o $(CONFI
+$(GCC_LINK) -o $@ $(CFLAGS) ada/b_gnatb.o $(GNATBIND_OBJS) 
$(EXTRA_HOST_OBJS) ggc-none.o libcommon-target.a $(LIBS) $(SYSLIBS) $(GNATLIB)
 
 # use target-gcc target-gnatmake target-gnatbind target-gnatlink
-gnattools: $(GCC_PARTS) $(CONFIG_H) prefix.o force
+gnattools: $(CONFIG_H) prefix.o force
$(MAKE) -C ada $(ADA_TOOLS_FLAGS_TO_PASS) gnattools1
$(MAKE) -C ada $(ADA_TOOLS_FLAGS_TO_PASS) gnattools2
 
Index: gcc/gcc/m2/Make-lang.in
===
--- gcc.orig/gcc/m2/Make-lang.in
+++ gcc/gcc/m2/Make-lang.in
@@ -65,8 +65,7 @@ RSTSRC =  $(srcdir)/doc/gm2.texi \
   m2/Builtins.rst
 
 # Define the names for selecting modula-2 in LANGUAGES.
-m2 modula-2 modula2: gm2$(exeext) xgcc$(exeext) cc1gm2$(exeext) \
- $(GCC_PASSES) $(GCC_PARTS)
+m2 modula-2 modula2: gm2$(exeext) xgcc$(exeext) cc1gm2$(exeext) $(GCC_PASSES)
 m2.serial = cc1gm2$(exeext)
 
 m2.tags: force

[PATCH 03/15] testsuite: Expand coverage for `__builtin_memset' with 0

2024-11-17 Thread Maciej W. Rozycki

Expand coverage for `__builtin_memset' for the special case of clearing 
a block, primarily for "setmemM" block set pattern, though with smaller 
sizes open-coded sequences may be produced instead.

This verifies block sizes in bytes from 1 to 64 across byte alignments 
of 1, 2, 4, 8 and byte misalignments within from 0 up to 7 (there's some 
redundancy there for the sake of simplicity of the test case), making 
sure all the intended area is cleared and no data is changed outside it.

This choice of ranges for the parameters has come from the Alpha 
backend, whose "setmemM" pattern has various corner cases related to 
base alignment and the misalignment within.

The test case has turned invaluable in verifying changes to the Alpha 
backend, but functionality covered is generic, so I have concluded this 
test qualifies for generic verification and does not have to be limited 
to the Alpha-specific subset of the testsuite.

Just as with `__builtin_memcpy' tests this code turned out to require 
quite a lot of time to compile, although a bit less than the former.

Example compilation times with reasonably fast POWER9@2.166GHz at `-O2' 
optimization and GCC built at `-O2' for various targets:

mips-linux-gnu:19s
vax-netbsdelf: 27s
alphaev56-linux-gnu:   30s
alpha-linux-gnu:   31s
powerpc64le-linux-gnu: 47s

With GCC built at `-O0':

alphaev56-linux-gnu: 2m59s
alpha-linux-gnu: 3m06s

I have therefore set the timeout factor accordingly so as to take slower
test hosts into account.

gcc/testsuite/
* gcc.c-torture/execute/memclr.c: New file.
---
 gcc/testsuite/gcc.c-torture/execute/memclr.c |  231 +++
 1 file changed, 231 insertions(+)

gcc-test-memclr.diff
Index: gcc/gcc/testsuite/gcc.c-torture/execute/memclr.c
===
--- /dev/null
+++ gcc/gcc/testsuite/gcc.c-torture/execute/memclr.c
@@ -0,0 +1,231 @@
+/* { dg-timeout-factor 4 } */
+
+typedef unsigned int __attribute__ ((mode (QI))) int08_t;
+typedef unsigned int __attribute__ ((mode (HI))) int16_t;
+typedef unsigned int __attribute__ ((mode (SI))) int32_t;
+typedef unsigned int __attribute__ ((mode (DI))) int64_t;
+
+typedef union
+  {
+int08_t v[88];
+  }
+a1_t;
+
+typedef union
+  {
+int08_t v[88];
+int16_t a;
+  }
+a2_t;
+
+typedef union
+  {
+int08_t v[88];
+int32_t a;
+  }
+a4_t;
+
+typedef union
+  {
+int08_t v[88];
+int64_t a;
+  }
+a8_t;
+
+#define MEMCLR_DEFINE_ONE(align, offset, count)
\
+  static void __attribute__ ((noinline))   \
+  memclr_check_one_ ## align ## offset ## count (void) \
+{  \
+  static a ## align ## _t dst = {{ [0 ... 87] = 0xaa }};   \
+  int i;   \
+   \
+  __builtin_memset (dst.v + 8 + offset, 0, count); \
+  asm ("" : : : "memory"); \
+  for (i = 0; i < 8 + offset; i++) \
+   if (dst.v[i] != 0xaa)   \
+ __builtin_abort ();   \
+  for (; i < 8 + offset + count; i++)  \
+   if (dst.v[i] != 0x00)   \
+ __builtin_abort ();   \
+  for (; i < sizeof (dst.v); i++)  \
+   if (dst.v[i] != 0xaa)   \
+ __builtin_abort ();   \
+}
+
+#define MEMCLR_DEFINE_ONE_ALIGN_OFFSET(align, offset)  \
+  MEMCLR_DEFINE_ONE (align, offset,  1)
\
+  MEMCLR_DEFINE_ONE (align, offset,  2)
\
+  MEMCLR_DEFINE_ONE (align, offset,  3)
\
+  MEMCLR_DEFINE_ONE (align, offset,  4)
\
+  MEMCLR_DEFINE_ONE (align, offset,  5)
\
+  MEMCLR_DEFINE_ONE (align, offset,  6)
\
+  MEMCLR_DEFINE_ONE (align, offset,  7)
\
+  MEMCLR_DEFINE_ONE (align, offset,  8)
\
+  MEMCLR_DEFINE_ONE (align, offset,  9)
\
+  MEMCLR_DEFINE_ONE (align, offset, 10)
\
+  MEMCLR_DEFINE_ONE (align, offset, 11)
\
+  MEMCLR_DEFINE_ONE (align, offset, 12)
\
+  MEMCLR_DEFINE_ONE (align, offset, 13)
\
+  MEM

[PATCH 02/15] testsuite: Expand coverage for `__builtin_memcpy'

2024-11-17 Thread Maciej W. Rozycki

Expand coverage for `__builtin_memcpy', primarily for "cpymemM" block 
copy pattern, although with smaller sizes open-coded sequences may be 
produced instead.

This verifies block sizes in bytes from 1 to 64, across byte alignments 
of 1, 2, 4, 8 and byte misalignments within from 0 up to 7 (there's some 
redundancy there for the sake of simplicity of the test cases) both for 
the source and the destination, making sure all data is copied and no 
data is changed outside the area meant to be written.

This choice of ranges for the parameters has come from the Alpha 
backend, whose "cpymemM" pattern covers copies being made of up to 64 
bytes and has various corner cases related to base alignment and the 
misalignment within.

The test cases have turned invaluable in verifying changes to the Alpha 
backend, but functionality covered is generic, so I have concluded these 
tests qualify for generic verification and do not have to be limited to 
the Alpha-specific subset of the testsuite.

On the implementation side the tests turned out to be quite stressful to 
GCC and the original simpler version that just expanded all code inline 
took a lot of time to complete compilation.  Depending on the target and 
compilation options elapsed times up to 40 minutes (!) have been seen, 
especially with GCC built at `-O0' for debugging purposes.

At the cost of increased complexity where a pair of macros is required 
per variant rather than just one I have split the code into individual 
functions forced not to be inlined and it improved compilation times 
considerably without losing coverage.

Example compilation times with reasonably fast POWER9@2.166GHz at `-O2' 
optimization and GCC built at `-O2' for various targets:

mips-linux-gnu:23s
vax-netbsdelf: 29s
alphaev56-linux-gnu:   39s
alpha-linux-gnu:   43s
powerpc64le-linux-gnu: 48s

With GCC built at `-O0':

alphaev56-linux-gnu: 3m37s
alpha-linux-gnu: 3m54s

I have therefore set the timeout factor accordingly so as to take slower 
test hosts into account.

gcc/testsuite/
* gcc.c-torture/execute/memcpy-a1.c: New file.
* gcc.c-torture/execute/memcpy-a2.c: New file.
* gcc.c-torture/execute/memcpy-a4.c: New file.
* gcc.c-torture/execute/memcpy-a8.c: New file.
* gcc.c-torture/execute/memcpy-ax.h: New file.
---
 gcc/testsuite/gcc.c-torture/execute/memcpy-a1.c |4 
 gcc/testsuite/gcc.c-torture/execute/memcpy-a2.c |4 
 gcc/testsuite/gcc.c-torture/execute/memcpy-a4.c |4 
 gcc/testsuite/gcc.c-torture/execute/memcpy-a8.c |4 
 gcc/testsuite/gcc.c-torture/execute/memcpy-ax.h |  243 
 5 files changed, 259 insertions(+)

gcc-test-memcpy.diff
Index: gcc/gcc/testsuite/gcc.c-torture/execute/memcpy-a1.c
===
--- /dev/null
+++ gcc/gcc/testsuite/gcc.c-torture/execute/memcpy-a1.c
@@ -0,0 +1,4 @@
+/* { dg-timeout-factor 8 } */
+
+#define ax_t a1_t
+#include "memcpy-ax.h"
Index: gcc/gcc/testsuite/gcc.c-torture/execute/memcpy-a2.c
===
--- /dev/null
+++ gcc/gcc/testsuite/gcc.c-torture/execute/memcpy-a2.c
@@ -0,0 +1,4 @@
+/* { dg-timeout-factor 8 } */
+
+#define ax_t a2_t
+#include "memcpy-ax.h"
Index: gcc/gcc/testsuite/gcc.c-torture/execute/memcpy-a4.c
===
--- /dev/null
+++ gcc/gcc/testsuite/gcc.c-torture/execute/memcpy-a4.c
@@ -0,0 +1,4 @@
+/* { dg-timeout-factor 8 } */
+
+#define ax_t a4_t
+#include "memcpy-ax.h"
Index: gcc/gcc/testsuite/gcc.c-torture/execute/memcpy-a8.c
===
--- /dev/null
+++ gcc/gcc/testsuite/gcc.c-torture/execute/memcpy-a8.c
@@ -0,0 +1,4 @@
+/* { dg-timeout-factor 8 } */
+
+#define ax_t a8_t
+#include "memcpy-ax.h"
Index: gcc/gcc/testsuite/gcc.c-torture/execute/memcpy-ax.h
===
--- /dev/null
+++ gcc/gcc/testsuite/gcc.c-torture/execute/memcpy-ax.h
@@ -0,0 +1,243 @@
+typedef unsigned int __attribute__ ((mode (QI))) int08_t;
+typedef unsigned int __attribute__ ((mode (HI))) int16_t;
+typedef unsigned int __attribute__ ((mode (SI))) int32_t;
+typedef unsigned int __attribute__ ((mode (DI))) int64_t;
+
+typedef union
+  {
+int08_t v[88];
+  }
+a1_t;
+
+typedef union
+  {
+int08_t v[88];
+int16_t a;
+  }
+a2_t;
+
+typedef union
+  {
+int08_t v[88];
+int32_t a;
+  }
+a4_t;
+
+typedef union
+  {
+int08_t v[88];
+int64_t a;
+  }
+a8_t;
+
+ax_t src = {{
+  0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08,
+  0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10,
+  0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18,
+  0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
+  0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
+  0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
+  0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38,
+  0x39, 0x3a, 0x3b, 0x3c

[PATCH 06/15] Alpha: Remove code duplication in block clear trailer

2024-11-17 Thread Maciej W. Rozycki

Remove code duplication in the part of `alpha_expand_block_clear' that 
handles any aligned trailing part of the block, observing that the two 
legs of code only differ by the machine mode and that we already take 
the same approach with handling any unaligned prefix earlier on.  No 
functional change, just code shuffling.

gcc/config/
* alpha/alpha.cc (alpha_expand_block_clear): Fold two legs of a 
conditional together.
---
 gcc/config/alpha/alpha.cc |   41 -
 1 file changed, 12 insertions(+), 29 deletions(-)

gcc-alpha-block-clear-tail-fold.diff
Index: gcc/gcc/config/alpha/alpha.cc
===
--- gcc.orig/gcc/config/alpha/alpha.cc
+++ gcc/gcc/config/alpha/alpha.cc
@@ -4236,40 +4236,23 @@ alpha_expand_block_clear (rtx operands[]
 
   /* If we have appropriate alignment (and it wouldn't take too many
  instructions otherwise), mask out the bytes we need.  */
-  if (TARGET_BWX ? words > 2 : bytes > 0)
+  if ((TARGET_BWX ? words > 2 : bytes > 0)
+  && (align >= 64 || (align >= 32 && bytes < 4)))
 {
-  if (align >= 64)
-   {
- rtx mem, tmp;
- HOST_WIDE_INT mask;
-
- mem = adjust_address (orig_dst, DImode, ofs);
- set_mem_alias_set (mem, 0);
-
- mask = HOST_WIDE_INT_M1U << (bytes * 8);
-
- tmp = expand_binop (DImode, and_optab, mem, GEN_INT (mask),
- NULL_RTX, 1, OPTAB_WIDEN);
-
- emit_move_insn (mem, tmp);
- return 1;
-   }
-  else if (align >= 32 && bytes < 4)
-   {
- rtx mem, tmp;
- HOST_WIDE_INT mask;
+  machine_mode mode = (align >= 64 ? DImode : SImode);
+  rtx mem, tmp;
+  HOST_WIDE_INT mask;
 
- mem = adjust_address (orig_dst, SImode, ofs);
- set_mem_alias_set (mem, 0);
+  mem = adjust_address (orig_dst, mode, ofs);
+  set_mem_alias_set (mem, 0);
 
- mask = HOST_WIDE_INT_M1U << (bytes * 8);
+  mask = HOST_WIDE_INT_M1U << (bytes * 8);
 
- tmp = expand_binop (SImode, and_optab, mem, GEN_INT (mask),
- NULL_RTX, 1, OPTAB_WIDEN);
+  tmp = expand_binop (mode, and_optab, mem, GEN_INT (mask),
+ NULL_RTX, 1, OPTAB_WIDEN);
 
- emit_move_insn (mem, tmp);
- return 1;
-   }
+  emit_move_insn (mem, tmp);
+  return 1;
 }
 
   if (!TARGET_BWX && bytes >= 4)

[PATCH 07/15] Alpha: Adjust MEM alignment for block clear [PR115459]

2024-11-17 Thread Maciej W. Rozycki

By inference it appears to me that the same fix for PR target/115459 
needs to be applied to the block clear operation that has been done for 
block move, as implemented by commit ccfe71518039 ("[alpha] adjust MEM 
alignment for block move [PR115459]").

gcc/
PR target/115459
* config/alpha/alpha.cc (alpha_expand_block_clear): Adjust MEM 
to match inferred alignment.
---
 gcc/config/alpha/alpha.cc |6 ++
 1 file changed, 6 insertions(+)

gcc-alpha-pr115459-clear.diff
Index: gcc/gcc/config/alpha/alpha.cc
===
--- gcc.orig/gcc/config/alpha/alpha.cc
+++ gcc/gcc/config/alpha/alpha.cc
@@ -4076,6 +4076,12 @@ alpha_expand_block_clear (rtx operands[]
   else if (a >= 16)
align = a, alignofs = 2 - c % 2;
}
+
+  if (MEM_P (orig_dst) && MEM_ALIGN (orig_dst) < align)
+   {
+ orig_dst = shallow_copy_rtx (orig_dst);
+ set_mem_align (orig_dst, align);
+   }
 }
 
   /* Handle an unaligned prefix first.  */

[PATCH 09/15] Alpha: Also use tree information to get base block alignment

2024-11-17 Thread Maciej W. Rozycki

We hardly ever emit code using machine instructions for aligned memory 
accesses for block move and clear operation and the reason for this 
appears to be that suboptimal alignment is often passed by the caller 
and then we only try to find a better alignment by checking pseudo 
register pointer alignment information, and from observation it's most 
often only set for stack frame references.

This code originates from before Tree SSA days and we can do better 
nowadays, by looking up the original tree node associated with a MEM 
RTL, so implement this approach, factoring out repeating code from 
`alpha_expand_block_move' and `alpha_expand_block_clear' to a new 
function.

In some cases, however, tree information is not available while pointer 
alignment is, such as with the case concerned with PR target/115459,
where we have:

(gdb) pr orig_src
(mem:BLK (plus:DI (reg/f:DI 65 virtual-stack-vars [ lock.206_2 ])
(const_int 8368 [0x20b0])) [8  S18 A8])
(gdb) pr orig_dst
(mem/j/c:BLK (plus:DI (reg/f:DI 65 virtual-stack-vars [ lock.206_2 ])
(const_int 8208 [0x2010])) [8 MEM[(struct 
gnat__debug_pools__print_info_stdout__internal__L_18__B1182b__S1183b___PAD 
*)_339].F[1 ...]{lb: 1 sz: 1}+0 S18 A128])
(gdb) 

showing no tree information and the alignment of 8 only for `orig_src', 
while indeed REGNO_POINTER_ALIGN returns 128 for pseudo 65.  So retain 
the old approach and return the largest alignment determined and its 
associated offset.

Add test cases accordingly and remove XFAILs from memclr-a2-o1-c9-ptr.c 
now that aligned code does get produced for it.

gcc/
* config/alpha/alpha.cc 
(alpha_get_mem_rtx_alignment_and_offset): New function.
(alpha_expand_block_move, alpha_expand_block_clear): Use it for 
alignment retrieval.

gcc/testsuite/
* gcc.target/alpha/memclr-a2-o1-c9-ptr.c: Remove XFAILs.
* gcc.target/alpha/memcpy-di-aligned.c: New file.
* gcc.target/alpha/memcpy-di-unaligned.c: New file.
* gcc.target/alpha/memcpy-di-unaligned-dst.c: New file.
* gcc.target/alpha/memcpy-di-unaligned-src.c: New file.
---
 gcc/config/alpha/alpha.cc|  158 +--
 gcc/testsuite/gcc.target/alpha/memclr-a2-o1-c9-ptr.c |   10 
 gcc/testsuite/gcc.target/alpha/memcpy-di-aligned.c   |   16 +
 gcc/testsuite/gcc.target/alpha/memcpy-di-unaligned-dst.c |   16 +
 gcc/testsuite/gcc.target/alpha/memcpy-di-unaligned-src.c |   15 +
 gcc/testsuite/gcc.target/alpha/memcpy-di-unaligned.c |   51 
 6 files changed, 205 insertions(+), 61 deletions(-)

gcc-alpha-mem-object-alignment.diff
Index: gcc/gcc/config/alpha/alpha.cc
===
--- gcc.orig/gcc/config/alpha/alpha.cc
+++ gcc/gcc/config/alpha/alpha.cc
@@ -3771,6 +3771,78 @@ alpha_expand_unaligned_store_words (rtx
   emit_move_insn (st_addr_1, st_tmp_1);
 }
 
+/* Get the base alignment and offset of EXPR in A and O respectively.
+   Check for any pseudo register pointer alignment and for any tree
+   node information and return the largest alignment determined and
+   its associated offset.  */
+
+static void
+alpha_get_mem_rtx_alignment_and_offset (rtx expr, int &a, HOST_WIDE_INT &o)
+{
+  HOST_WIDE_INT tree_offset = 0, reg_offset = 0, mem_offset = 0;
+  int tree_align = 0, reg_align = 0, mem_align = MEM_ALIGN (expr);
+
+  gcc_assert (MEM_P (expr));
+
+  rtx addr = XEXP (expr, 0);
+  switch (GET_CODE (addr))
+{
+case REG:
+  reg_align = REGNO_POINTER_ALIGN (REGNO (addr));
+  break;
+
+case PLUS:
+  if (REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
+   {
+ reg_offset = INTVAL (XEXP (addr, 1));
+ reg_align = REGNO_POINTER_ALIGN (REGNO (XEXP (addr, 0)));
+   }
+  break;
+
+default:
+  break;
+}
+
+  tree mem = MEM_EXPR (expr);
+  if (mem != NULL_TREE)
+switch (TREE_CODE (mem))
+  {
+  case MEM_REF:
+   tree_offset = mem_ref_offset (mem).force_shwi ();
+   tree_align = get_object_alignment (get_base_address (mem));
+   break;
+
+  case COMPONENT_REF:
+   {
+ tree byte_offset = component_ref_field_offset (mem);
+ tree bit_offset = DECL_FIELD_BIT_OFFSET (TREE_OPERAND (mem, 1));
+ poly_int64 offset;
+ if (!byte_offset
+ || !poly_int_tree_p (byte_offset, &offset)
+ || !tree_fits_shwi_p (bit_offset))
+   break;
+ tree_offset = offset + tree_to_shwi (bit_offset) / BITS_PER_UNIT;
+   }
+   tree_align = get_object_alignment (get_base_address (mem));
+   break;
+
+  default:
+   break;
+  }
+
+  if (reg_align > mem_align)
+{
+  mem_offset = reg_offset;
+  mem_align = reg_align;
+}
+  if (tree_align > mem_align)
+{
+  mem_offset = tree_offset;
+  mem_align = tree_align;
+}
+  o = mem_offset;
+  a = mem_align;
+}
 
 /* Expand string/block move operations.
 
@@ -379

[PATCH 10/15] Alpha: Optimize block moves coming from longword-aligned source

2024-11-17 Thread Maciej W. Rozycki

Now that we have proper alignment determination for block moves in place 
the case of copying a block of longword-aligned data has become real, so 
implement the merging of loaded data from pairs of SImode registers into 
single DImode registers for the purpose of using with unaligned stores 
efficiently, as suggested by a comment in `alpha_expand_block_move' and 
discard the comment.  Provide test cases accordingly.

gcc/
* config/alpha/alpha.cc (alpha_expand_block_move): Merge loaded
data from pairs of SImode registers into single DImode registers 
if to be used with unaligned stores.

gcc/testsuite/
* gcc.target/alpha/memcpy-si-aligned.c: New file.
* gcc.target/alpha/memcpy-si-unaligned.c: New file.
* gcc.target/alpha/memcpy-si-unaligned-dst.c: New file.
* gcc.target/alpha/memcpy-si-unaligned-src.c: New file.
* gcc.target/alpha/memcpy-si-unaligned-src-bwx.c: New file.
---
 gcc/config/alpha/alpha.cc|   45 +++--
 gcc/testsuite/gcc.target/alpha/memcpy-si-aligned.c   |   16 +++
 gcc/testsuite/gcc.target/alpha/memcpy-si-unaligned-dst.c |   16 +++
 gcc/testsuite/gcc.target/alpha/memcpy-si-unaligned-src-bwx.c |   11 ++
 gcc/testsuite/gcc.target/alpha/memcpy-si-unaligned-src.c |   15 +++
 gcc/testsuite/gcc.target/alpha/memcpy-si-unaligned.c |   51 +++
 6 files changed, 146 insertions(+), 8 deletions(-)

gcc-alpha-block-move-si-unaligned.diff
Index: gcc/gcc/config/alpha/alpha.cc
===
--- gcc.orig/gcc/config/alpha/alpha.cc
+++ gcc/gcc/config/alpha/alpha.cc
@@ -3930,14 +3930,44 @@ alpha_expand_block_move (rtx operands[])
 {
   words = bytes / 4;
 
-  for (i = 0; i < words; ++i)
-   data_regs[nregs + i] = gen_reg_rtx (SImode);
+  /* Load an even quantity of SImode data pieces only.  */
+  unsigned int hwords = words / 2;
+  for (i = 0; i / 2 < hwords; ++i)
+   {
+ data_regs[nregs + i] = gen_reg_rtx (SImode);
+ emit_move_insn (data_regs[nregs + i],
+ adjust_address (orig_src, SImode, ofs + i * 4));
+   }
 
-  for (i = 0; i < words; ++i)
-   emit_move_insn (data_regs[nregs + i],
-   adjust_address (orig_src, SImode, ofs + i * 4));
+  /* If we'll be using unaligned stores, merge data from pairs
+of SImode registers into DImode registers so that we can
+store it more efficiently via quadword unaligned stores.  */
+  unsigned int j;
+  if (dst_align < 32)
+   for (i = 0, j = 0; i < words / 2; ++i, j = i * 2)
+ {
+   rtx hi = expand_simple_binop (DImode, ASHIFT,
+ data_regs[nregs + j + 1],
+ GEN_INT (32), NULL_RTX,
+ 1, OPTAB_WIDEN);
+   data_regs[nregs + i] = expand_simple_binop (DImode, IOR, hi,
+   data_regs[nregs + j],
+   NULL_RTX,
+   1, OPTAB_WIDEN);
+ }
+  else
+   j = i;
 
-  nregs += words;
+  /* Take care of any remaining odd trailing SImode data piece.  */
+  if (j < words)
+   {
+ data_regs[nregs + i] = gen_reg_rtx (SImode);
+ emit_move_insn (data_regs[nregs + i],
+ adjust_address (orig_src, SImode, ofs + j * 4));
+ ++i;
+   }
+
+  nregs += i;
   bytes -= words * 4;
   ofs += words * 4;
 }
@@ -4056,13 +4086,12 @@ alpha_expand_block_move (rtx operands[])
 }
 
   /* Due to the above, this won't be aligned.  */
-  /* ??? If we have more than one of these, consider constructing full
- words in registers and using alpha_expand_unaligned_store_words.  */
   while (i < nregs && GET_MODE (data_regs[i]) == SImode)
 {
   alpha_expand_unaligned_store (orig_dst, data_regs[i], 4, ofs);
   ofs += 4;
   i++;
+  gcc_assert (i == nregs || GET_MODE (data_regs[i]) != SImode);
 }
 
   if (dst_align >= 16)
Index: gcc/gcc/testsuite/gcc.target/alpha/memcpy-si-aligned.c
===
--- /dev/null
+++ gcc/gcc/testsuite/gcc.target/alpha/memcpy-si-aligned.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "" } */
+/* { dg-skip-if "" { *-*-* } { "-O0" } } */
+
+unsigned int aligned_src_si[17] = { [0 ... 16] = 0xeaebeced };
+unsigned int aligned_dst_si[17] = { [0 ... 16] = 0xdcdbdad9 };
+
+void
+memcpy_aligned_data_si (void)
+{
+  __builtin_memcpy (aligned_dst_si + 1, aligned_src_si + 1, 60);
+}
+
+/* { dg-final { scan-assembler-times "\\sldl\\s" 15 } } */
+/* { dg-final { scan-assembler-times "\\sstl\\s" 15 } } */
+/* { dg-final { scan-assembler-not "\\s(?:ldq_u|stq_u)\\s" } } */
Index: gcc/gcc/testsuite

[PATCH 08/15] Alpha: Fix offset adjustment in unaligned access helpers

2024-11-17 Thread Maciej W. Rozycki

Correct the offset adjustment made in the multi-word unaligned access 
helpers such that it is actually used by the unaligned load and store 
instructions, fixing a bug introduced with commit 1eb356b98df2 ("alpha 
gprel optimizations")[1] back in 2001, which replaced address changes 
made directly according to the argument of the MEM expression passed 
with one made according to an address previously extracted from said MEM 
expression.  The address is however incorrectly extracted from said MEM 
before an adjustment has been made to it for the offset supplied.

This bug is usually covered by the fact that our block move and clear 
operations are hardly ever provided with correct block alignment data 
and we also usually fail to fetch that information from the MEM supplied 
(although PR target/115459 shows it does happen sometimes).  Instead the 
bit alignment of 8 is usually conservatively used, meaning that a zero 
offset is passed to `alpha_expand_unaligned_store_words' and then code 
has been written such that neither `alpha_expand_unaligned_load_words' 
nor `alpha_expand_unaligned_store_words' can ever be called with 
nonzero offset from `alpha_expand_block_move'.

The only situation where `alpha_expand_unaligned_store_words' can be 
called with nonzero offset is from `alpha_expand_block_clear' with a BWX 
target for a misaligned block that has been embedded in a data object of 
a higher alignment such that there is a small unaligned prefix our code 
decides to handle so as to align further stores.

For instance it happens when a block clear is called for a block of 9 
bytes embedded at offset 1 in a structure aligned to a 2-byte word, as 
illustrated by the test case included.  Now this test case does not work 
without the change that comes next applied, because the backend cannot 
see the word alignment of the struct and uses the bit alignment of 8 
instead.

Should this change be swapped with the next one, incorrect code such as:

stb $31,1($16)
lda $3,1($16)
ldq_u $2,8($16)
ldq_u $1,1($16)
mskqh $2,$3,$2
stq_u $2,8($16)
mskql $1,$3,$1
stq_u $1,1($16)

would be produced, where the unadjusted offsets of 1/8 can be seen with 
the LDQ_U/STQ_U operations along with byte masks calculated accordingly 
rather than the expected offsets of 2/9.  As a result the byte at the 
offset of 9 fails to get cleared.  In these circumstances this would 
also show as execution failures with the memclr.c test:

FAIL: gcc.c-torture/execute/memclr.c   -O1  execution test
FAIL: gcc.c-torture/execute/memclr.c   -Os  execution test

-- not at `-O0' though, as the higher alignment cannot be retrieved in 
that case, and then not at `-O2' or higher optimization levels either, 
because then we choose to open-code this block clear instead:

ldbu $1,0($16)
stw $31,8($16)
stq $1,0($16)

avoiding the bug in `alpha_expand_unaligned_store_words'.

I am leaving the pattern match test case XFAIL-ed here for documentation 
purposes and it will be un-XFAIL-ed along with the fix to retrieve the 
correct alignment.  The run test is of course never expected to fail.

References:

[1] 


gcc/
* config/alpha/alpha.cc (alpha_expand_unaligned_load_words): 
Move address extraction until after the MEM referred has been 
adjusted for the offset supplied.
(alpha_expand_unaligned_store_words): Likewise.

gcc/testsuite/
* gcc.target/alpha/memclr-a2-o1-c9-ptr.c: New file.
* gcc.target/alpha/memclr-a2-o1-c9-run.c: New file.
---
 gcc/config/alpha/alpha.cc|   16 +++---
 gcc/testsuite/gcc.target/alpha/memclr-a2-o1-c9-ptr.c |   50 +++
 gcc/testsuite/gcc.target/alpha/memclr-a2-o1-c9-run.c |   25 +
 3 files changed, 83 insertions(+), 8 deletions(-)

gcc-alpha-unaligned-words-adjust-address.diff
Index: gcc/gcc/config/alpha/alpha.cc
===
--- gcc.orig/gcc/config/alpha/alpha.cc
+++ gcc/gcc/config/alpha/alpha.cc
@@ -3625,10 +3625,6 @@ alpha_expand_unaligned_load_words (rtx *
   rtx sreg, areg, tmp, smema;
   HOST_WIDE_INT i;
 
-  smema = XEXP (smem, 0);
-  if (GET_CODE (smema) == LO_SUM)
-smema = force_reg (Pmode, smema);
-
   /* Generate all the tmp registers we need.  */
   for (i = 0; i < words; ++i)
 {
@@ -3640,6 +3636,10 @@ alpha_expand_unaligned_load_words (rtx *
   if (ofs != 0)
 smem = adjust_address (smem, GET_MODE (smem), ofs);
 
+  smema = XEXP (smem, 0);
+  if (GET_CODE (smema) == LO_SUM)
+smema = force_reg (Pmode, smema);
+
   /* Load up all of the source data.  */
   for (i = 0; i < words; ++i)
 {
@@ -3698,10 +3698,6 @@ alpha_expand_unaligned_store_words (rtx
   rtx st_addr_1, st_addr_2, dmema;
   HOST_WIDE_INT i;
 
-  dmema = XEXP (dmem, 0);
-  if (GET_CODE (dmema) == LO_SUM)
-dmema = f

[PATCH 14/15] Alpha: Add option to avoid data races for sub-longword memory stores

2024-11-17 Thread Maciej W. Rozycki

With non-BWX Alpha implementations we have a problem of data races where 
an 8-bit byte or 16-bit word quantity is to be written to memory in that 
in those cases we use an unprotected RMW access of a 32-bit longword or 
64-bit quadword width.  If contents of the longword or quadword accessed 
outside the byte or word to be written are changed midway through by a 
concurrent write executing on the same CPU such as by a signal handler 
or a parallel write executing on another CPU such as by another thread 
or via a shared memory segment, then the concluding write of the RMW 
access will clobber them.  This is especially important for the safety 
of RCU algorithms, but is otherwise an issue anyway.

To guard against these data races with byte and aligned word quantities 
introduce the `-msafe-bwa' command-line option (standing for Safe Byte & 
Word Access) that instructs the compiler to instead use an atomic RMW 
access sequence where byte and word memory access machine instructions 
are not available.  There is no change to code produced for BWX targets.

It would be sufficient for the secondary reload handler to use a pair of 
scratch registers, as requested by `reload_out', but it would end up 
with poor code produced as one of the scratches would be occupied by 
data retrieved and the other one would have to be reloaded with repeated 
calculations, all within the LL/SC sequence.

Therefore I chose to add a dedicated `reload_out_safe_bwa' handler 
and ask for more scratches there by defining a 256-bit OI integer mode.  
While reload is documented in our manual to support an arbitrary number 
of scratches, in reality it hasn't been implemented for IRA:

/* ??? It would be useful to be able to handle only two, or more than
   three, operands, but for now we can only handle the case of having
   exactly three: output, input and one temp/scratch.  */

and it seems to be the case for LRA as well.  Do what everyone else does 
then and just have one wide multi-register scratch.

I note that the atomic sequences emitted are suboptimal performance-wise 
as the looping branch for the unsuccessful completion of the sequence 
points backwards, which means it will be predicted as taken despite that 
in most cases it will fall through.  I do not see it as a deficiency of 
this change proposed as it takes care of recording that the branch is 
unlikely to be taken, by calling `alpha_emit_unlikely_jump'.  Therefore 
generic code elsewhere 
shou

Add test cases accordingly.

There are notable regressions between a plain `-mno-bwx' configuration 
and a `-mno-bwx -msafe-bwa' one:

FAIL: gcc.dg/torture/inline-mem-cpy-cmp-1.c   -O0  execution test
FAIL: gcc.dg/torture/inline-mem-cpy-cmp-1.c   -O1  execution test
FAIL: gcc.dg/torture/inline-mem-cpy-cmp-1.c   -O2  execution test
FAIL: gcc.dg/torture/inline-mem-cpy-cmp-1.c   -O3 -g  execution test
FAIL: gcc.dg/torture/inline-mem-cpy-cmp-1.c   -Os  execution test
FAIL: gcc.dg/torture/inline-mem-cpy-cmp-1.c   -O2 -flto -fno-use-linker-plugin 
-flto-partition=none  execution test
FAIL: gcc.dg/torture/inline-mem-cpy-cmp-1.c   -O2 -flto -fuse-linker-plugin 
-fno-fat-lto-objects  execution test
FAIL: g++.dg/init/array25.C  -std=c++17 execution test
FAIL: g++.dg/init/array25.C  -std=c++98 execution test
FAIL: g++.dg/init/array25.C  -std=c++26 execution test

They come from the fact that these test cases play tricks with alignment 
and end up calling code that expects a reference to aligned data but is 
handed one to unaligned data.

This doesn't cause a visible problem with plain `-mno-bwx' code, because 
the resulting alignment exception is fixed up by Linux.  There's no such 
handling currently implemented for LDL_L or LDQ_L instructions (which 
are first in the sequence) and consequently the offender is issued with 
SIGBUS instead.  Suitable handling will be added to Linux to complement 
this change, so these regressions are seen as harmless and expected.

gcc/
* config/alpha/alpha-modes.def (OI): New integer mode.
* config/alpha/alpha-protos.h (alpha_expand_mov_safe_bwa): New 
prototype.
* config/alpha/alpha.cc (alpha_expand_mov_safe_bwa): New 
function.
(alpha_secondary_reload): Handle TARGET_SAFE_BWA.
* config/alpha/alpha.md (aligned_store_safe_bwa)
(unaligned_store_safe_bwa, reload_out_safe_bwa)
(reload_out_unaligned_safe_bwa): New expanders.
(mov, movcqi, reload_out_aligned): Handle 
TARGET_SAFE_BWA.
(reload_out): Guard against TARGET_SAFE_BWA.
* config/alpha/alpha.opt (msafe-bwa): New option.
* config/alpha/alpha.opt.urls: Regenerate.
* doc/invoke.texi (Option Summary, DEC Alpha Options): Document 
the new option.

gcc/testsuite/
* gcc.target/alpha/stb.c: New file.
* gcc.target/alpha/stb-bwa.c: New file.
* gcc.target/alpha/stb-bwx.c: New file.
* gcc.target/alpha/stba.c: New file.
* gcc.target/alpha/stba

[COMMITTED] Alpha: Remove leftover `;;' for "unaligned_store"

2024-11-17 Thread Maciej W. Rozycki

Remove stray `;;' from the middle of the introductory comment for the 
"unaligned_store" expander, clearly a leftover from a previous 
edition.

gcc/
* config/alpha/alpha.md (unaligned_store): Remove stray 
`;;'.
---
 Committed as obvious.
---
 gcc/config/alpha/alpha.md |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

gcc-alpha-unaligned-store-comment.diff
Index: gcc/gcc/config/alpha/alpha.md
===
--- gcc.orig/gcc/config/alpha/alpha.md
+++ gcc/gcc/config/alpha/alpha.md
@@ -4201,7 +4201,7 @@
 })
 
 ;; For the unaligned byte and halfword cases, we use code similar to that
-;; in the ;; Architecture book, but reordered to lower the number of registers
+;; in the Architecture book, but reordered to lower the number of registers
 ;; required.  Operand 0 is the address.  Operand 1 is the data to store.
 ;; Operands 2, 3, and 4 are DImode temporaries, where operands 2 and 4 may
 ;; be the same temporary, if desired.  If the address is in a register,

[PATCH 13/15] IRA+LRA: Let the backend request to split basic blocks

2024-11-17 Thread Maciej W. Rozycki

The next change for Alpha will produce extra labels and branches in 
reload, which in turn requires basic blocks to be split at completion. 
We do this already for functions that can trap, so just extend the 
arrangement with a flag for the backend to use whenever it finds it 
necessary.

gcc/
* function.h (struct function): Add 
`split_basic_blocks_after_reload' member.
* lra.cc (lra): Handle it.
* reload1.cc (reload): Likewise.
---
 NB I do hope such a small change for a new feature can be still accepted 
for reload despite its final days coming soon now.  This will unblock the 
Alpha port for progressing including in particular with the conversion to 
LRA (as far as I'm concerned).
---
 gcc/function.h |3 +++
 gcc/lra.cc |6 --
 gcc/reload1.cc |6 --
 3 files changed, 11 insertions(+), 4 deletions(-)

gcc-split-basic-blocks-after-reload.diff
Index: gcc/gcc/function.h
===
--- gcc.orig/gcc/function.h
+++ gcc/gcc/function.h
@@ -449,6 +449,9 @@ struct GTY(()) function {
   /* Set for artificial function created for [[assume (cond)]].
  These should be GIMPLE optimized, but not expanded to RTL.  */
   unsigned int assume_function : 1;
+
+  /* Nonzero if reload will have to split basic blocks.  */
+  unsigned int split_basic_blocks_after_reload : 1;
 };
 
 /* Add the decl D to the local_decls list of FUN.  */
Index: gcc/gcc/lra.cc
===
--- gcc.orig/gcc/lra.cc
+++ gcc/gcc/lra.cc
@@ -2594,8 +2594,10 @@ lra (FILE *f, int verbose)
 
   inserted_p = fixup_abnormal_edges ();
 
-  /* We've possibly turned single trapping insn into multiple ones.  */
-  if (cfun->can_throw_non_call_exceptions)
+  /* Split basic blocks if we've possibly turned single trapping insn
+ into multiple ones or otherwise the backend requested to do so.  */
+  if (cfun->can_throw_non_call_exceptions
+  || cfun->split_basic_blocks_after_reload)
 {
   auto_sbitmap blocks (last_basic_block_for_fn (cfun));
   bitmap_ones (blocks);
Index: gcc/gcc/reload1.cc
===
--- gcc.orig/gcc/reload1.cc
+++ gcc/gcc/reload1.cc
@@ -1272,8 +1272,10 @@ reload (rtx_insn *first, int global)
 
   inserted = fixup_abnormal_edges ();
 
-  /* We've possibly turned single trapping insn into multiple ones.  */
-  if (cfun->can_throw_non_call_exceptions)
+  /* Split basic blocks if we've possibly turned single trapping insn
+ into multiple ones or otherwise the backend requested to do so.  */
+  if (cfun->can_throw_non_call_exceptions
+  || cfun->split_basic_blocks_after_reload)
 {
   auto_sbitmap blocks (last_basic_block_for_fn (cfun));
   bitmap_ones (blocks);

[PATCH 12/15] Alpha: Export `emit_unlikely_jump' for a subsequent change to use

2024-11-17 Thread Maciej W. Rozycki

Rename `emit_unlikely_jump' function to `alpha_emit_unlikely_jump', so 
as to avoid namespace pollution, updating callers accordingly and export 
it for use in the machine description.  Make it return the insn emitted.

gcc/
* config/alpha/alpha-protos.h (alpha_emit_unlikely_jump): New 
prototype.
* config/alpha/alpha.cc (emit_unlikely_jump): Rename to...
(alpha_emit_unlikely_jump): ... this.  Return the insn emitted.
(alpha_split_atomic_op, alpha_split_compare_and_swap)
(alpha_split_compare_and_swap_12, alpha_split_atomic_exchange)
(alpha_split_atomic_exchange_12): Update call sites accordingly.
---
 gcc/config/alpha/alpha-protos.h |1 +
 gcc/config/alpha/alpha.cc   |   19 ++-
 2 files changed, 11 insertions(+), 9 deletions(-)

gcc-alpha-emit-unlikely-jump-export.diff
Index: gcc/gcc/config/alpha/alpha-protos.h
===
--- gcc.orig/gcc/config/alpha/alpha-protos.h
+++ gcc/gcc/config/alpha/alpha-protos.h
@@ -59,6 +59,7 @@ extern rtx alpha_expand_zap_mask (HOST_W
 extern void alpha_expand_builtin_vector_binop (rtx (*)(rtx, rtx, rtx),
   machine_mode,
   rtx, rtx, rtx);
+extern rtx alpha_emit_unlikely_jump (rtx, rtx);
 extern void alpha_expand_builtin_establish_vms_condition_handler (rtx, rtx);
 extern void alpha_expand_builtin_revert_vms_condition_handler (rtx);
 
Index: gcc/gcc/config/alpha/alpha.cc
===
--- gcc.orig/gcc/config/alpha/alpha.cc
+++ gcc/gcc/config/alpha/alpha.cc
@@ -4420,12 +4420,13 @@ alpha_expand_builtin_vector_binop (rtx (
 /* A subroutine of the atomic operation splitters.  Jump to LABEL if
COND is true.  Mark the jump as unlikely to be taken.  */
 
-static void
-emit_unlikely_jump (rtx cond, rtx label)
+rtx
+alpha_emit_unlikely_jump (rtx cond, rtx label)
 {
   rtx x = gen_rtx_IF_THEN_ELSE (VOIDmode, cond, label, pc_rtx);
   rtx_insn *insn = emit_jump_insn (gen_rtx_SET (pc_rtx, x));
   add_reg_br_prob_note (insn, profile_probability::very_unlikely ());
+  return insn;
 }
 
 /* Subroutines of the atomic operation splitters.  Emit barriers
@@ -4517,7 +4518,7 @@ alpha_split_atomic_op (enum rtx_code cod
   emit_insn (gen_store_conditional (mode, cond, mem, scratch));
 
   x = gen_rtx_EQ (DImode, cond, const0_rtx);
-  emit_unlikely_jump (x, label);
+  alpha_emit_unlikely_jump (x, label);
 
   alpha_post_atomic_barrier (model);
 }
@@ -4567,7 +4568,7 @@ alpha_split_compare_and_swap (rtx operan
   emit_insn (gen_rtx_SET (cond, x));
   x = gen_rtx_EQ (DImode, cond, const0_rtx);
 }
-  emit_unlikely_jump (x, label2);
+  alpha_emit_unlikely_jump (x, label2);
 
   emit_move_insn (cond, newval);
   emit_insn (gen_store_conditional
@@ -4576,7 +4577,7 @@ alpha_split_compare_and_swap (rtx operan
   if (!is_weak)
 {
   x = gen_rtx_EQ (DImode, cond, const0_rtx);
-  emit_unlikely_jump (x, label1);
+  alpha_emit_unlikely_jump (x, label1);
 }
 
   if (!is_mm_relaxed (mod_f))
@@ -4679,7 +4680,7 @@ alpha_split_compare_and_swap_12 (rtx ope
   emit_insn (gen_rtx_SET (cond, x));
   x = gen_rtx_EQ (DImode, cond, const0_rtx);
 }
-  emit_unlikely_jump (x, label2);
+  alpha_emit_unlikely_jump (x, label2);
 
   emit_insn (gen_mskxl (cond, scratch, mask, addr));
 
@@ -4691,7 +4692,7 @@ alpha_split_compare_and_swap_12 (rtx ope
   if (!is_weak)
 {
   x = gen_rtx_EQ (DImode, cond, const0_rtx);
-  emit_unlikely_jump (x, label1);
+  alpha_emit_unlikely_jump (x, label1);
 }
 
   if (!is_mm_relaxed (mod_f))
@@ -4731,7 +4732,7 @@ alpha_split_atomic_exchange (rtx operand
   emit_insn (gen_store_conditional (mode, cond, mem, scratch));
 
   x = gen_rtx_EQ (DImode, cond, const0_rtx);
-  emit_unlikely_jump (x, label);
+  alpha_emit_unlikely_jump (x, label);
 
   alpha_post_atomic_barrier (model);
 }
@@ -4805,7 +4806,7 @@ alpha_split_atomic_exchange_12 (rtx oper
   emit_insn (gen_store_conditional (DImode, scratch, mem, scratch));
 
   x = gen_rtx_EQ (DImode, scratch, const0_rtx);
-  emit_unlikely_jump (x, label);
+  alpha_emit_unlikely_jump (x, label);
 
   alpha_post_atomic_barrier (model);
 }

[committed] Improve ext-dce's ability to eliminate more extensions

2024-11-17 Thread Jeff Law

I was looking at a regression in ext-dce's behavior just before 
Cauldron.  Essentially a bugfix in ext-dce ended up causing us to fail 
to eliminate some useless extensions.


When we have a SUBREG object with SUBREG_PROMOTED_VAR* flags set, we 
generally have to be more conservative in how we process bit group 
liveness, making bits live that wouldn't obviously be live otherwise.


That's not always necessary though. For example, if we're storing a 
promoted subreg into memory, we may not care about those extra live bits 
on this instance of the subreg object (remember subregs are not 
shared!).  Essentially if the mode of the memory reference is not wider 
than the mode of the inner REG, then we can clear the promoted state 
which in turn may allow more extension elimination.


So at the start of ext-dce we do a simple pass over the IL and remove 
promoted subreg state when it's obviously safe to do so (memory stores 
when the modes allow it).  That prevents extra bits from being live and 
ultimately allows us to remove more useless extensions.


The testcase is in theory generic, but many targets won't have an 
opportunity to optimize this case.  So rather than build out a large 
inclusion/exclusion list, I've just made the test risc-v specific.


Bootstrapped and regression tested on aarch64, riscv64, s390x, etc in my 
tester.  Pushing to the trunk.


Jeff



commit beec291225be9b5e7a60b6818cf80224c343811d
Author: Jeff Law 
Date:   Sun Nov 17 16:44:09 2024 -0700

Improve ext-dce's ability to eliminate more extensions

I was looking at a regression in ext-dce's behavior just before Cauldron.
Essentially a bugfix in ext-dce ended up causing us to fail to eliminate 
some
useless extensions.

When we have a SUBREG object with SUBREG_PROMOTED_VAR* flags set, we 
generally
have to be more conservative in how we process bit group liveness, making 
bits
live that wouldn't obviously be live otherwise.

That's not always necessary though. For example, if we're storing a promoted
subreg into memory, we may not care about those extra live bits on this
instance of the subreg object (remember subregs are not shared!).  
Essentially
if the mode of the memory reference is not wider than the mode of the inner
REG, then we can clear the promoted state which in turn may allow more
extension elimination.

So at the start of ext-dce we do a simple pass over the IL and remove 
promoted
subreg state when it's obviously safe to do so (memory stores when the modes
allow it).  That prevents extra bits from being live and ultimately allows 
us
to remove more useless extensions.

The testcase is in theory generic, but many targets won't have an 
opportunity
to optimize this case.  So rather than build out a large inclusion/exclusion
list, I've just made the test risc-v specific.

Bootstrapped and regression tested on aarch64, riscv64, s390x, etc in my 
tester.

gcc/
* ext-dce.cc (maybe_clear_subreg_promoted_p): New function.
(ext_dce_execute): Call it.

gcc/testsuite
* gcc.target/riscv/ext-dce-1.c: New test.

diff --git a/gcc/ext-dce.cc b/gcc/ext-dce.cc
index 0ece37726c7..649d39fadf9 100644
--- a/gcc/ext-dce.cc
+++ b/gcc/ext-dce.cc
@@ -941,6 +941,38 @@ ext_dce_process_bb (basic_block bb)
 }
 }
 
+/* SUBREG_PROMOTED_VAR_P is set by the gimple->rtl optimizers and
+   is usually helpful.  However, in some cases setting the value when
+   it is not strictly needed can cause this pass to miss optimizations.
+
+   Specifically consider (set (mem) (subreg (reg))).  If set in that
+   case it will cause more bit groups to be live for REG than would
+   be strictly necessary which in turn can inhibit extension removal.
+
+   So do a pass over the IL wiping the SUBREG_PROMOTED_VAR_P when it
+   is obviously not needed.  */
+
+static void
+maybe_clear_subreg_promoted_p (void)
+{
+  for (rtx_insn *insn = get_insns(); insn; insn = NEXT_INSN (insn))
+{
+  if (!NONDEBUG_INSN_P (insn))
+   continue;
+
+  rtx set = single_set (insn);
+  if (!set)
+   continue;
+
+  /* There may be other cases where we should clear, but for
+now, this is the only known case where it causes problems.  */
+  if (MEM_P (SET_DEST (set)) && SUBREG_P (SET_SRC (set))
+&& GET_MODE (SET_DEST (set)) <= GET_MODE (SUBREG_REG (SET_SRC (set
+   SUBREG_PROMOTED_VAR_P (SET_SRC (set)) = 0;
+}
+}
+
+
 /* We optimize away sign/zero extensions in this pass and replace
them with SUBREGs indicating certain bits are don't cares.
 
@@ -1077,6 +1109,9 @@ static bool ext_dce_rd_confluence_n (edge) { return true; 
}
 void
 ext_dce_execute (void)
 {
+  /* Some settings of SUBREG_PROMOTED_VAR_P are actively harmful
+ to this pass.  Clear it for those cases.  */
+  maybe_clear_subreg_promoted_p ();
   df_analyze ();
   ext_dce_init ();
 
diff --git a/gcc/testsuite/gcc

Re: [PATCH] testsuite: Fix pr101145inf*.c testcases [PR117494]

2024-11-17 Thread Jeff Law





On 11/17/24 2:48 PM, Andrew Pinski wrote:

Instead of doing a dg-run with a specific target check for linux.
Use signal as the effective-target since this requires the use
of ALARM signal to do the testing.
Also use check_vect in the main and rename main to main1 to make sure
we don't use the registers.

Tested on x86_64-linux-gnu.

PR testsuite/117494
gcc/testsuite/ChangeLog:

* gcc.dg/vect/pr101145inf.c: Remove dg-do and replace
with dg-require-effective-target of signal.
* gcc.dg/vect/pr101145inf_1.c: Likewise.
* gcc.dg/vect/pr101145inf.inc: Rename main to main1
and mark as noinline.
Include tree-vect.h. Have main call check_vect and main1.

OK
jeff

Re: [pushed] doc: Streamline hppa*-hp-hpux11 installation instructions

2024-11-17 Thread Gerald Pfeifer

On Sun, 17 Nov 2024, John David Anglin wrote:
>> Dave, any chance you can go through the HP/UX installation docs and see
>> what else can be trimmed or needs an update?
> I went through the HP/UX installation docs and trimmed a bunch of stuff.  Okay?
> 
> Dave
> ---
> gcc:
>     * doc/install.texi (Specific) : Update anchor and heading
>     to reflect removal of 32-bit hppa support on HP-UX.   Trim 32-bit related
> text.

Nice! You are the subject matter expert and maintainer, so you don't need 
anyone else's approval. :-) That said, I read through the changes and they 
make sense to me. (Some of the removed bits I had wondered about.)

Can you please refer to 
  PR target/69374
in the commit message as follows?

gcc:
PR target/69374
* doc/install.texi (Specific) : Update anchor and...

Thank you,
Gerald

Re: [patch,avr] PR84211: Add a new post reload optimization pass

2024-11-17 Thread Georg-Johann Lay


Am 17.11.24 um 15:42 schrieb Georg-Johann Lay:

Here are some examples:


Here are the examples again, but unTABified for legibility:

   Without optimization  |   With optimization
     |   =

   long long fn_zero (void)
   {
  return 0;
   }

   ldi r18, 0 ;  movqi_insn  |   ldi r18, 0 ;  movqi_insn
   ldi r19, 0 ;  movqi_insn  |   ldi r19, 0 ;  movqi_insn
   ldi r20, 0 ;  movqi_insn  |   movw r20, r18  ;  *movhi
   ldi r21, 0 ;  movqi_insn  |
   ldi r22, 0 ;  movqi_insn  |   movw r22, r18  ;  *movhi
   ldi r23, 0 ;  movqi_insn  |
   ldi r24, 0 ;  movqi_insn  |   movw r24, r18  ;  *movhi
   ldi r25, 0 ;  movqi_insn  |
   ret   |   ret

   int fn_eq0 (char c)
   {
   return c == 0;
   }

   mov r18, r24;  movqi_insn |   mov r18, r24   ;  movqi_insn
   ldi r24, 1  ;  *movhi |   ldi r24, 1 ;  *movhi
   ldi r25, 0|   ldi r25, 0
   cp  r18, ZERO   ;  cmpqi3 |   cpse r18, ZERO ;  peephole
   breq .+4;  branch |
   ldi r24, 0  ;  *movhi |   ldi r24, 0 ;  movqi_insn
   ldi r25, 0|
   ret   |   ret

   unsigned fn_crc (unsigned x, unsigned y)
   {
   for (char i = 8; i--; x <<= 1)
   y ^= (x ^ y) & 0x80 ? 79u : 0u;
   return y;
   }

   movw r18, r24   ;  *movhi |  movw r18, r24;  *movhi
   movw r24, r22   ;  *movhi |  movw r24, r22;  *movhi
   ldi  r22, 8 ;  movqi_insn |  ldi  r22, 8  ;  movqi_insn
  .L13:  | .L13:
   movw r30, r18   ;  *movhi |  movw r30, r18;  *movhi
   eor  r30, r24   ;  *xorqi3|  eor  r30, r24;  *xorqi3
   eor  r31, r25   ;  *xorqi3|  eor  r31, r25;  *xorqi3
   mov  r20, r30   ;  *andhi3|  mov  r20, r30;  *andqi3
   andi r20, 1<<7|  andi r20, 1<<7
   clr  r21  |
   sbrs r30, 7 ;  *sbrx_branchhi |  sbrc r30, 7  ;  *sbrx_branchhi
   rjmp .+4  |
   ldi  r20, 79;  movqi_insn |  ldi  r20, 79 ;  movqi_insn
   ldi  r21, 0 ;  movqi_insn |
   eor  r24, r20   ;  *xorqi3|  eor r24, r20 ;  *xorqi3
   eor  r25, r21   ;  *xorqi3|
   lsl  r18;  *ashlhi3_const |  lsl  r18 ;  *ashlhi3_const
   rol  r19  |  rol  r19
   subi r22, 1 ;  *op8.for.cczn.p|  subi r22, 1  ;  *op8.for.cczn.plus

   brne .L13   ;  branch_ZN  |  brne .L13;  branch_ZN
   ret   |  ret

   #define SPDR (*(uint8_t volatile*) 0x2c)

   void fn_PR49807 (long big)
   {
   SPDR = big >> 24;
   SPDR = big >> 16;
   SPDR = big >> 8;
   SPDR = big;
   }

   movw r20, r22   ;  *movhi |  movw r20, r22;  *movhi
   movw r22, r24   ;  *movhi |  movw r22, r24;  *movhi
   mov  r24, r23   ;  *ashrsi3_const |
   clr  r27  |
   sbrc r24,7|
   com  r27  |
   mov  r25, r27 |
   mov  r26, r27 |
   out  0xc, r24   ;  movqi_insn |  out 0xc, r23 ;  movqi_insn
   movw r24, r22   ;  *ashrsi3_const |
   clr  r27  |
   sbrc r25, 7   |
   com  r27  |
   mov  r26, r27 |
   out  0xc, r24   ;  movqi_insn |  out 0xc, r24 ;  movqi_insn
   clr  r27;  *ashrsi3_const |
   sbrc r23, 7   |
   dec  r27  |
   mov  r26, r23 |
   mov  r25, r22 |
   mov  r24, r21 |
   out  0xc, r24   ;  movqi_insn |  out 0xc, r21 ;  movqi_insn
   out  0xc, r20   ;  movqi_insn |  out 0xc, r20 ;  movqi_insn
   ret   |  ret

Johann

[PATCH 15/15] Alpha: Add option to avoid data races for partial writes

2024-11-17 Thread Maciej W. Rozycki

Similarly to data races with 8-bit byte or 16-bit word quantity memory 
writes on non-BWX Alpha implementations we have the same problem even on 
BWX implementations with partial memory writes produced for unaligned 
stores as well as block memory move and clear operations.  This happens 
at the boundaries of the area written where we produce unprotected RMW 
sequences, such as for example:

ldbu $1,0($3)
stw $31,8($3)
stq $1,0($3)

to zero a 9-byte member at the byte offset of 1 of a quadword-aligned 
struct, happily clobbering a 1-byte member at the beginning of said 
struct if a concurrent write happens while executing on the same CPU such 
as in a signal handler or a parallel write happens while executing on 
another CPU such as in another thread or via a shared memory segment.

To guard against these data races with partial memory write accesses 
introduce the `-msafe-partial' command-line option that instructs the 
compiler to protect boundaries of the data quantity accessed by instead 
using a longer code sequence composed of narrower memory writes where 
suitable machine instructions are available (i.e. with BWX targets) or 
atomic RMW access sequences where byte and word memory access machine 
instructions are not available (i.e. with non-BWX targets).

Owing to the desire of branch avoidance there are redundant overlapping 
writes in unaligned cases where STQ_U operations are used in the middle 
of a block so as to make sure no part of data to be written has been 
lost regardless of run-time alignment.  For the non-BWX case it means 
that with blocks whose size is not a multiple of 8 there are additional 
atomic RMW sequences issued towards the end of the block in addition to 
the always required pair enclosing the block from each end.

Only one such additional atomic RMW sequence is actually required, but 
code currently issues two for the sake of simplicity.  An improvement 
might be added to `alpha_expand_unaligned_store_words_safe_partial' in 
the future, by folding `alpha_expand_unaligned_store_safe_partial' code 
for handling multi-word blocks whose size is not a multiple of 8 (i.e. 
with a trailing partial-word part).  It would improve performance a bit, 
but current code is correct regardless.

Add test cases accordingly.

There are notable regressions between a plain `-mno-bwx' configuration
and a `-mno-bwx -msafe-partial' one:

FAIL: gm2/iso/run/pass/strcons.mod execution,  -g
FAIL: gm2/iso/run/pass/strcons.mod execution,  -O
FAIL: gm2/iso/run/pass/strcons.mod execution,  -O -g
FAIL: gm2/iso/run/pass/strcons.mod execution,  -Os
FAIL: gm2/iso/run/pass/strcons.mod execution,  -O3 -fomit-frame-pointer
FAIL: gm2/iso/run/pass/strcons.mod execution,  -O3 -fomit-frame-pointer 
-finline-functions
FAIL: gm2/iso/run/pass/strcons4.mod execution,  -g
FAIL: gm2/iso/run/pass/strcons4.mod execution,  -O
FAIL: gm2/iso/run/pass/strcons4.mod execution,  -O -g
FAIL: gm2/iso/run/pass/strcons4.mod execution,  -Os
FAIL: gm2/iso/run/pass/strcons4.mod execution,  -O3 -fomit-frame-pointer
FAIL: gm2/iso/run/pass/strcons4.mod execution,  -O3 -fomit-frame-pointer 
-finline-functions

Just as with `-msafe-bwa' regressions they come from the fact that these 
test cases end up calling code that expects a reference to aligned data 
but is handed one to unaligned data, causing an alignment exception with 
LDL_L or LDQ_L, which will eventually be fixed up by Linux.

In some cases GCC chooses to open-code block memory write operations, so 
with non-BWX targets `-msafe-partial' will in the usual case have to be 
used together with `-msafe-bwa'.

gcc/
* config/alpha/alpha-protos.h 
(alpha_expand_unaligned_store_safe_partial): New prototype.
* config/alpha/alpha.cc (alpha_expand_movmisalign)
(alpha_expand_block_move, alpha_expand_block_clear): Handle 
TARGET_SAFE_PARTIAL.
(alpha_expand_unaligned_store_safe_partial)
(alpha_expand_unaligned_store_words_safe_partial)
(alpha_expand_clear_safe_partial_nobwx): New functions.
* config/alpha/alpha.md (insvmisaligndi): Handle 
TARGET_SAFE_PARTIAL.
* config/alpha/alpha.opt (msafe-partial): New option.
* config/alpha/alpha.opt.urls: Regenerate.
* doc/invoke.texi (Option Summary, DEC Alpha Options): Document
the new option.

gcc/testsuite/
* gcc.target/alpha/memcpy-di-unaligned-dst.c: New file.
* gcc.target/alpha/memcpy-di-unaligned-dst-safe-partial.c: New 
file.
* gcc.target/alpha/memcpy-di-unaligned-dst-safe-partial-bwx.c: 
New file.
* gcc.target/alpha/memcpy-si-unaligned-dst.c: New file.
* gcc.target/alpha/memcpy-si-unaligned-dst-safe-partial.c: New 
file.
* gcc.target/alpha/memcpy-si-unaligned-dst-safe-partial-bwx.c: 
New file.
---
 NB from my limited experience with Modula 2 decades ago I thought the 
language was strongly-typed, so an alignment mismatch

Re: [patch,avr] Adjust comment headers

2024-11-17 Thread Jan Dubiec


On 16.11.2024 13:19, Gerald Pfeifer wrote:
[...]

How should that be changed? (Simply drop the Atmel line?)
I am not sure what you mean, but I think "Atmel" should be replaced with 
"Microchip" because other devices have manufacturers listed.


https://www.microchip.com/pdf/mchp_to_acquire_atmel.pdf

Re: [PATCH] Add fancy pointer support in std::map/set

2024-11-17 Thread François Dumont


Hi

Here is a new proposal with all the cleanup regarding _Const_Base_ptr 
that makes support of allocator's fancy pointer type simpler.


Also submitted as PR:

https://forge.sourceware.org/gcc/gcc-TEST/pulls/27

   libstdc++: Add fancy pointer support in map and set

    Support fancy allocator pointer type in std::_Rb_tree<>.

    In case of fancy pointer type the container is now storing the 
pointer to

    _Rb_tree_pnode<> as a pointer to _Rb_tree_pnode_base<>.

    Many methods are adapted to take and return _Base_ptr in place of 
_Link_type

    which has been renamed into _Node_ptr.

    As all nodes are stored as _Base_ptr, have all methods work with 
this type

    and remove _Const_Base_ptr and all methods associated to it.

    libstdc++-v3/ChangeLog:

    * include/bits/stl_set.h (std::set<>::upper_bound<>(const 
_Kt&) const): Fix

    decltype typo to use const_iterator.
    * include/bits/stl_tree.h
    (_Rb_tree_ptr_traits<>): New.
    (_Rb_tree_pnode_base<>): New.
    (_Rb_tree_node_base): Inherit from latter.
    (_Rb_tree_node_base::_Const_Base_ptr): Remove.
    (_Rb_tree_node_base::_S_minimum(_Const_Base_ptr)): Remove.
    (_Rb_tree_node_base::_S_maximum(_Const_Base_ptr)): Remove.
    (_Rb_tree_pheader): New.
    (_Rb_tree_header): Inherit from latter.
    (_Rb_tree_node_val): New.
    (_Rb_tree_node): Inherit from latter.
    (_Rb_tree_pnode): New.
    (_Rb_tree_iterator<>::_Link_type): Rename into...
    (_Rb_tree_iterator<>::_Node_ptr): ...this.
    (_Rb_tree_const_iterator<>::_Link_type): Rename into...
    (_Rb_tree_const_iterator<>::_Node_ptr): ...this.
    (_Rb_tree_const_iterator<>::_M_node): Change type into 
_Base_ptr.

    (_Rb_tree_const_iterator<>::_M_const_cast): Remove.
    (_Rb_tree_helpers<>): New.
    (_Rb_tree_piterator): New.
    (_Rb_tree_const_piterator): New.
    (_Rb_tree_node_traits<>): New.
    (_Rb_tree::_Node_base, _Rb_tree::_Node_type): New.
    (_Rb_tree<>::_Const_Base_ptr): Remove.
    (_Rb_tree): Adapt to generalize usage of _Base_ptr in place 
of _Link_type.

    (_Rb_tree<>::_M_mbegin): Remove.
    (_Rb_tree<>::_S_left(_Const_Base_ptr)): Remove.
    (_Rb_tree<>::_S_right(_Const_Base_ptr)): Remove.
    (_Rb_tree<>::_S_maximum(_Const_Base_ptr)): Remove.
    (_Rb_tree<>::_S_minimum(_Const_Base_ptr)): Remove.
    * testsuite/23_containers/map/allocator/ext_ptr.cc: New 
test case.
    * testsuite/23_containers/multimap/allocator/ext_ptr.cc: 
New test case.
    * testsuite/23_containers/multiset/allocator/ext_ptr.cc: 
New test case.
    * testsuite/23_containers/set/allocator/ext_ptr.cc: New 
test case.


Tested under Linux x64.


Note that I've also run the 23_containers tests on map, multimap, 
multiset and set tweaking implementation
so that new types are being used when C++11 or later even if allocator 
pointer type is a C pointer.


Ok to commit ?

François

On 12/11/2024 15:56, Jonathan Wakely wrote:

On Mon, 4 Nov 2024 at 21:34, François Dumont  wrote:


On 04/11/2024 19:45, Jonathan Wakely wrote:

On Mon, 4 Nov 2024 at 18:30, François Dumont  wrote:

On 21/10/2024 06:56, François Dumont wrote:


On 17/10/2024 23:11, Jonathan Wakely wrote:



On Thu, 17 Oct 2024 at 21:39, Jonathan Wakely  wrote:


On Thu, 17 Oct 2024 at 20:52, François Dumont  wrote:

Here is an updated version that incorporates, I think, all your feedback. It's 
much cleaner indeed.

Thanks, I'll take a look tomorrow.


It's also tested in C++98/17/23.

I'm surprised that we do not need to consider potential 
allocator::const_pointer.

Do you mean consider the case where Alloc::const_pointer is not the same type 
as rebinding 'pointer' to a const element type?

Yes, exactly.



We don't need to consider that because we never get a 'const_pointer' from the 
allocator, and we never need to pass a 'const_pointer' to the allocator. The 
allocator's 'allocate' and 'deallocate' members both work with the 'pointer' type, so 
we only need to use that type when interacting with the allocator. For all the other 
uses, such as _Const_Node_ptr, what we need is a pointer-to-const that's compatible 
with the allocator's pointer type. It doesn't actually matter if it's the same type 
as allocator_traits::const_pointer, because we don't need

Sorry, I sent the email before finishing that thought!

... we don't need to pass a const_pointer to anything, we only need it for the 
container's own purposes.

But thinking about it some more, do we even need a const-pointer for the 
container?  Currently the const_iterator stores a const-pointer, and some 
members like _M_root() and _M_leftmost() return a const-pointer. But they don't 
need to. The nodes are all pointed to by a non-const _Base_ptr, none of the 
storage manage

Re: [r15-5336 Regression] FAIL: gcc.dg/guality/pr36728-4.c -Os -DPREVENT_OPTIMIZATION line 16 y == 2 on Linux/x86_64

2024-11-17 Thread Jan Hubicka

> >> FAIL: gcc.dg/guality/pr36728-2.c   -O2  -DPREVENT_OPTIMIZATION  line 16 
> >> arg1 == 1
> >> FAIL: gcc.dg/guality/pr36728-2.c   -O2  -DPREVENT_OPTIMIZATION  line 16 
> >> arg2 == 2
> >> FAIL: gcc.dg/guality/pr36728-2.c   -O2  -DPREVENT_OPTIMIZATION  line 16 
> >> arg3 == 3
> >> FAIL: gcc.dg/guality/pr36728-2.c   -O2  -DPREVENT_OPTIMIZATION  line 16 
> >> arg4 == 4
> >> FAIL: gcc.dg/guality/pr36728-2.c   -O2  -DPREVENT_OPTIMIZATION  line 16 
> >> arg5 == 5
> >> FAIL: gcc.dg/guality/pr36728-2.c   -O2  -DPREVENT_OPTIMIZATION  line 16 
> >> arg6 == 6
> >> FAIL: gcc.dg/guality/pr36728-2.c   -O2  -DPREVENT_OPTIMIZATION  line 16 
> >> arg7 == 30
> >> FAIL: gcc.dg/guality/pr36728-2.c   -O2  -DPREVENT_OPTIMIZATION  line 18 
> >> arg1 == 1
> >> FAIL: gcc.dg/guality/pr36728-2.c   -O2  -DPREVENT_OPTIMIZATION  line 18 
> >> arg2 == 2
> >> FAIL: gcc.dg/guality/pr36728-2.c   -O2  -DPREVENT_OPTIMIZATION  line 18 
> >> arg3 == 3
> >> FAIL: gcc.dg/guality/pr36728-2.c   -O2  -DPREVENT_OPTIMIZATION  line 18 
> >> arg4 == 4
> >> FAIL: gcc.dg/guality/pr36728-2.c   -O2  -DPREVENT_OPTIMIZATION  line 18 
> >> arg5 == 5
> >> FAIL: gcc.dg/guality/pr36728-2.c   -O2  -DPREVENT_OPTIMIZATION  line 18 
> >> arg6 == 6
> >> FAIL: gcc.dg/guality/pr36728-2.c   -O2  -DPREVENT_OPTIMIZATION  line 18 
> >> arg7 == 30
> >> FAIL: gcc.dg/guality/pr36728-2.c   -O2 -flto -fno-use-linker-plugin 
> >> -flto-partition=none  -DPREVENT_OPTIMIZATION line 16 arg1 == 1
> >> FAIL: gcc.dg/guality/pr36728-2.c   -O2 -flto -fno-use-linker-plugin 
> >> -flto-partition=none  -DPREVENT_OPTIMIZATION line 16 arg2 == 2
> >> FAIL: gcc.dg/guality/pr36728-2.c   -O2 -flto -fno-use-linker-plugin 
> >> -flto-partition=none  -DPREVENT_OPTIMIZATION line 16 arg3 == 3
> >> FAIL: gcc.dg/guality/pr36728-2.c   -O2 -flto -fno-use-linker-plugin 
> >> -flto-partition=none  -DPREVENT_OPTIMIZATION line 16 arg4 == 4
> >> FAIL: gcc.dg/guality/pr36728-2.c   -O2 -flto -fno-use-linker-plugin 
> >> -flto-partition=none  -DPREVENT_OPTIMIZATION line 16 arg5 == 5
> >> FAIL: gcc.dg/guality/pr36728-2.c   -O2 -flto -fno-use-linker-plugin 
> >> -flto-partition=none  -DPREVENT_OPTIMIZATION line 16 arg6 == 6
> >> FAIL: gcc.dg/guality/pr36728-2.c   -O2 -flto -fno-use-linker-plugin 
> >> -flto-partition=none  -DPREVENT_OPTIMIZATION line 16 arg7 == 30
> >> FAIL: gcc.dg/guality/pr36728-2.c   -O2 -flto -fno-use-linker-plugin 
> >> -flto-partition=none  -DPREVENT_OPTIMIZATION line 18 arg1 == 1
> >> FAIL: gcc.dg/guality/pr36728-2.c   -O2 -flto -fno-use-linker-plugin 
> >> -flto-partition=none  -DPREVENT_OPTIMIZATION line 18 arg2 == 2
> >> FAIL: gcc.dg/guality/pr36728-2.c   -O2 -flto -fno-use-linker-plugin 
> >> -flto-partition=none  -DPREVENT_OPTIMIZATION line 18 arg3 == 3
> >> FAIL: gcc.dg/guality/pr36728-2.c   -O2 -flto -fno-use-linker-plugin 
> >> -flto-partition=none  -DPREVENT_OPTIMIZATION line 18 arg4 == 4
> >> FAIL: gcc.dg/guality/pr36728-2.c   -O2 -flto -fno-use-linker-plugin 
> >> -flto-partition=none  -DPREVENT_OPTIMIZATION line 18 arg5 == 5
> >> FAIL: gcc.dg/guality/pr36728-2.c   -O2 -flto -fno-use-linker-plugin 
> >> -flto-partition=none  -DPREVENT_OPTIMIZATION line 18 arg6 == 6
> >> FAIL: gcc.dg/guality/pr36728-2.c   -O2 -flto -fno-use-linker-plugin 
> >> -flto-partition=none  -DPREVENT_OPTIMIZATION line 18 arg7 == 30
> >> FAIL: gcc.dg/guality/pr36728-2.c   -O2 -flto -fuse-linker-plugin 
> >> -fno-fat-lto-objects  -DPREVENT_OPTIMIZATION line 16 arg1 == 1
> >> FAIL: gcc.dg/guality/pr36728-2.c   -O2 -flto -fuse-linker-plugin 
> >> -fno-fat-lto-objects  -DPREVENT_OPTIMIZATION line 16 arg2 == 2
> >> FAIL: gcc.dg/guality/pr36728-2.c   -O2 -flto -fuse-linker-plugin 
> >> -fno-fat-lto-objects  -DPREVENT_OPTIMIZATION line 16 arg3 == 3
> >> FAIL: gcc.dg/guality/pr36728-2.c   -O2 -flto -fuse-linker-plugin 
> >> -fno-fat-lto-objects  -DPREVENT_OPTIMIZATION line 16 arg4 == 4
> >> FAIL: gcc.dg/guality/pr36728-2.c   -O2 -flto -fuse-linker-plugin 
> >> -fno-fat-lto-objects  -DPREVENT_OPTIMIZATION line 16 arg5 == 5
> >> FAIL: gcc.dg/guality/pr36728-2.c   -O2 -flto -fuse-linker-plugin 
> >> -fno-fat-lto-objects  -DPREVENT_OPTIMIZATION line 16 arg6 == 6
> >> FAIL: gcc.dg/guality/pr36728-2.c   -O2 -flto -fuse-linker-plugin 
> >> -fno-fat-lto-objects  -DPREVENT_OPTIMIZATION line 16 arg7 == 30
> >> FAIL: gcc.dg/guality/pr36728-2.c   -O2 -flto -fuse-linker-plugin 
> >> -fno-fat-lto-objects  -DPREVENT_OPTIMIZATION line 18 arg1 == 1
> >> FAIL: gcc.dg/guality/pr36728-2.c   -O2 -flto -fuse-linker-plugin 
> >> -fno-fat-lto-objects  -DPREVENT_OPTIMIZATION line 18 arg2 == 2
> >> FAIL: gcc.dg/guality/pr36728-2.c   -O2 -flto -fuse-linker-plugin 
> >> -fno-fat-lto-objects  -DPREVENT_OPTIMIZATION line 18 arg3 == 3
> >> FAIL: gcc.dg/guality/pr36728-2.c   -O2 -flto -fuse-linker-plugin 
> >> -fno-fat-lto-objects  -DPREVENT_OPTIMIZATION line 18 arg4 == 4
> >> FAIL: gcc.dg/guality/pr36728-2.c   -O2 -flto -fuse-linker-plugin 
> >> -fno-fat-lto-objects  -DPREVENT_OPTIMIZATION line 18 arg5 == 5
> >> FAIL: gcc.dg/guality/pr36728-2.

[pushed] libstdc++: Update link to Angelika Langer's book

2024-11-17 Thread Gerald Pfeifer

Pushed.

Gerald


libstdc++-v3:
* doc/xml/manual/io.xml: Update link to Angelika Langer's book.
* doc/html/manual/streambufs.html: Regenerate.
---
 libstdc++-v3/doc/html/manual/streambufs.html | 2 +-
 libstdc++-v3/doc/xml/manual/io.xml   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/libstdc++-v3/doc/html/manual/streambufs.html 
b/libstdc++-v3/doc/html/manual/streambufs.html
index d3c592177f0..e299d70cb0f 100644
--- a/libstdc++-v3/doc/html/manual/streambufs.html
+++ b/libstdc++-v3/doc/html/manual/streambufs.html
@@ -6,7 +6,7 @@
 Creating your own stream buffers for I/O can be remarkably easy.
   If you are interested in doing so, we highly recommend two very
   excellent books:
-  http://www.angelikalanger.com/iostreams.html"; 
target="_top">Standard C++
+  https://angelikalanger.com/iostreams.html"; 
target="_top">Standard C++
   IOStreams and Locales by Langer and Kreft, ISBN 0-201-18395-1, and
   http://www.josuttis.com/libbook/"; 
target="_top">The C++ Standard Library
   by Nicolai Josuttis, ISBN 0-201-37926-0.  Both are published by
diff --git a/libstdc++-v3/doc/xml/manual/io.xml 
b/libstdc++-v3/doc/xml/manual/io.xml
index 8380683561a..d6fe2ce841d 100644
--- a/libstdc++-v3/doc/xml/manual/io.xml
+++ b/libstdc++-v3/doc/xml/manual/io.xml
@@ -190,7 +190,7 @@
Creating your own stream buffers for I/O can be remarkably easy.
   If you are interested in doing so, we highly recommend two very
   excellent books:
-  http://www.w3.org/1999/xlink"; 
xlink:href="http://www.angelikalanger.com/iostreams.html";>Standard C++
+  http://www.w3.org/1999/xlink"; 
xlink:href="https://angelikalanger.com/iostreams.html";>Standard C++
   IOStreams and Locales by Langer and Kreft, ISBN 0-201-18395-1, and
   http://www.w3.org/1999/xlink"; 
xlink:href="http://www.josuttis.com/libbook/";>The C++ Standard Library
   by Nicolai Josuttis, ISBN 0-201-37926-0.  Both are published by
-- 
2.47.0

[PATCH 0/4] Improve and add VLS slide strategies.

2024-11-17 Thread Robin Dapp

From: Robin Dapp 

This small series adds slide, interleave, and even/odd permute strategies
as well as an improved slide1up pattern.

A note: Right now the slide tests as well as the even/odd run tests fail.
This is due to two separate bugs, one in varasm and one in vsetvl-avlprop.

In varasm we don't handle our vector mask modes correctly:
For even/odd we use compress instructions whose different masks start
with the same pattern (1 0 1 0) but have different sizes.
As the RTX hash uses the wrong size for riscv's mask modes it only
looks at the first parts.  Thus, we consider the entire masks similar
and erroneously unify them in the constant pool.
I have a working fix for this but it still has minor fallout on our test
server.

The avlprop issue I haven't debugged further but the tests stop failing
once I disable the avlprop pass.

I'm going to post separate patches for both issues in stage 3 but still
wanted to get this series "out of the door" first.

Regtested on rv64gcv.

Robin Dapp (4):
  RISC-V: Add slide to perm_const strategies.
  RISC-V: Add interleave pattern.
  RISC-V: Add even/odd vec_perm_const pattern.
  RISC-V: Improve slide1up pattern.

 gcc/config/riscv/riscv-protos.h   |   1 +
 gcc/config/riscv/riscv-v.cc   | 287 +-
 gcc/config/riscv/riscv.cc |  18 +-
 .../gcc.target/riscv/rvv/autovec/pr112599-2.c |   2 +-
 .../autovec/vls-vlmax/shuffle-evenodd-run.c   | 123 
 .../rvv/autovec/vls-vlmax/shuffle-evenodd.c   |  68 +
 .../vls-vlmax/shuffle-interleave-run.c| 124 
 .../autovec/vls-vlmax/shuffle-interleave.c|  69 +
 .../autovec/vls-vlmax/shuffle-slide-run1.c|  81 +
 .../autovec/vls-vlmax/shuffle-slide-run2.c| 271 +
 .../rvv/autovec/vls-vlmax/shuffle-slide1.c| 137 +
 .../rvv/autovec/vls-vlmax/shuffle-slide2.c| 207 +
 12 files changed, 1376 insertions(+), 12 deletions(-)
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/shuffle-evenodd-run.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/shuffle-evenodd.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/shuffle-interleave-run.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/shuffle-interleave.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/shuffle-slide-run1.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/shuffle-slide-run2.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/shuffle-slide1.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/shuffle-slide2.c

-- 
2.47.0

[PATCH v4 0/8] Add maskload else operand.

2024-11-17 Thread Robin Dapp

From: Robin Dapp 

Hi,

changes from v3:

- Check if we support vec_cond_expr for the selected mode in case we
  need to set the inactive elements to zero.
- Add another undef operand to gcn.
- Remove unnecessary changes in i386 patch.

Robin Dapp (8):
  docs: Document maskload else operand and behavior.
  ifn: Add else-operand handling.
  tree-ifcvt: Add zero maskload else value.
  vect: Add maskload else value support.
  aarch64: Add masked-load else operands.
  gcn: Add else operand to masked loads.
  i386: Add zero maskload else operand.
  RISC-V: Add else operand to masked loads [PR115336].

 .../aarch64/aarch64-sve-builtins-base.cc  |  24 +-
 gcc/config/aarch64/aarch64-sve-builtins.cc|  12 +-
 gcc/config/aarch64/aarch64-sve-builtins.h |   2 +-
 gcc/config/aarch64/aarch64-sve.md |  52 ++-
 gcc/config/aarch64/aarch64-sve2.md|   3 +-
 gcc/config/aarch64/iterators.md   |   4 -
 gcc/config/aarch64/predicates.md  |   4 +
 gcc/config/gcn/gcn-valu.md|  23 +-
 gcc/config/gcn/predicates.md  |   2 +
 gcc/config/i386/sse.md|  21 +-
 gcc/config/riscv/autovec.md   |  50 +--
 gcc/config/riscv/predicates.md|   3 +
 gcc/config/riscv/riscv-v.cc   |  30 +-
 gcc/doc/md.texi   |  63 ++--
 gcc/internal-fn.cc| 148 ++--
 gcc/internal-fn.h |  13 +-
 gcc/optabs-query.cc   |  70 +++-
 gcc/optabs-query.h|   3 +-
 gcc/optabs-tree.cc|  66 +++-
 gcc/optabs-tree.h |   8 +-
 .../gcc.target/riscv/rvv/autovec/pr115336.c   |  20 ++
 .../gcc.target/riscv/rvv/autovec/pr116059.c   |  15 +
 gcc/tree-if-conv.cc   |  12 +-
 gcc/tree-vect-data-refs.cc|  74 ++--
 gcc/tree-vect-patterns.cc |  12 +-
 gcc/tree-vect-slp.cc  |  25 +-
 gcc/tree-vect-stmts.cc| 326 +++---
 gcc/tree-vectorizer.h |  10 +-
 28 files changed, 854 insertions(+), 241 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/pr115336.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/pr116059.c

-- 
2.47.0

[pushed] c++: -M and modules again

2024-11-17 Thread Jason Merrill

Tested x86_64-pc-linux-gnu, applying to trunk.

-- 8< --

While experimenting with testing module std I noticed that gcc -M broke on
it; it seems I need to set directives_only even sooner than I did in
r15-4219.

gcc/c-family/ChangeLog:

* c-ppoutput.cc (preprocess_file): Don't set directives_only here.

gcc/cp/ChangeLog:

* module.cc (module_preprocess_options): Set directives_only here.
---
 gcc/c-family/c-ppoutput.cc | 1 -
 gcc/cp/module.cc   | 2 ++
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/gcc/c-family/c-ppoutput.cc b/gcc/c-family/c-ppoutput.cc
index f6f83bdff00..97ea864df14 100644
--- a/gcc/c-family/c-ppoutput.cc
+++ b/gcc/c-family/c-ppoutput.cc
@@ -95,7 +95,6 @@ preprocess_file (cpp_reader *pfile)
   if (flag_modules)
{
  /* For macros from imported headers we need directives_only_cb.  */
- cpp_get_options (pfile)->directives_only = true;
  scan_translation_unit_directives_only (pfile);
}
   else
diff --git a/gcc/cp/module.cc b/gcc/cp/module.cc
index 27eb39b48fa..4136fdceb9f 100644
--- a/gcc/cp/module.cc
+++ b/gcc/cp/module.cc
@@ -21224,6 +21224,8 @@ module_preprocess_options (cpp_reader *reader)
}
   auto *opt = cpp_get_options (reader);
   opt->module_directives = true;
+  if (flag_no_output)
+   opt->directives_only = true;
   if (opt->main_search == CMS_none)
opt->main_search = cpp_main_search (flag_header_unit);
 }

base-commit: bd59f2eeacd41b91e4e79b32dda83cc60d499e25
prerequisite-patch-id: 1d5f14b39e65d5cab453a0381f695e1a43547123
prerequisite-patch-id: e310dd58cbb3709f815844319766be6cb6888a47
-- 
2.47.0

Re: [committed] c: Default to -std=gnu23

2024-11-17 Thread Gerald Pfeifer

On Sat, 16 Nov 2024, Andrew Pinski wrote:
>> I started seeing the following on x86_64-unknown-freebsd13.3 over night:
> Submitted https://gcc.gnu.org/pipermail/gcc-patches/2024-November/669117.html
> to fix those warnings. The code was already partly ANSIfied even.

Thank you - happy to confirm this addresses that build for me.

Gerald

[pushed] libstdc++: Move a gcc.gnu.org link to https

2024-11-17 Thread Gerald Pfeifer

libstdc++-v3:
* doc/xml/manual/intro.xml: Move a gcc.gnu.org link to https.
* doc/html/manual/license.html: Regenerate.
---
 libstdc++-v3/doc/xml/manual/intro.xml | 2 +-
 libstdc++-v3/doc/html/manual/license.html | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/libstdc++-v3/doc/xml/manual/intro.xml 
b/libstdc++-v3/doc/xml/manual/intro.xml
index b940e9cfa90..ed0b90d202b 100644
--- a/libstdc++-v3/doc/xml/manual/intro.xml
+++ b/libstdc++-v3/doc/xml/manual/intro.xml
@@ -75,7 +75,7 @@
 
 
   There is a license section in the FAQ regarding common questions. If you have more
-  questions, ask the FSF or the http://www.w3.org/1999/xlink"; 
xlink:href="http://gcc.gnu.org/lists.html";>gcc mailing list.
+  questions, ask the FSF or the http://www.w3.org/1999/xlink"; 
xlink:href="https://gcc.gnu.org/lists.html";>gcc mailing list.
 
 
 The Code: GPL
diff --git a/libstdc++-v3/doc/html/manual/license.html 
b/libstdc++-v3/doc/html/manual/license.html
index e10808a8908..f6a9e6ce6dd 100644
--- a/libstdc++-v3/doc/html/manual/license.html
+++ b/libstdc++-v3/doc/html/manual/license.html
@@ -4,7 +4,7 @@
 and one for the documentation.
 
   There is a license section in the FAQ regarding common questions. If you have more
-  questions, ask the FSF or the http://gcc.gnu.org/lists.html"; target="_top">gcc mailing list.
+  questions, ask the FSF or the https://gcc.gnu.org/lists.html"; target="_top">gcc mailing list.
 The Code: 
GPL
   The source code is distributed under the GNU General Public License version 3,
   with the addition under section 7 of an exception described in
-- 
2.47.0

Re: [PATCH 1/3] ipa-strub: Replace cgraph_node order with uid.

2024-11-17 Thread Jan Hubicka

> ipa_strub_set_mode_for_new_functions uses node order as unique ever
> increasing identifier. This is better satisfied with uid.
> Order loses uniqueness with following patches.
> 
> gcc/ChangeLog:
>   * ipa-strub.cc (ipa_strub_set_mode_for_new_functions): Replace
> order with uid.
>   (pass_ipa_strub_mode::execute): Likewise.
OK,
thanks!

Honza

[PATCH V1] RISC-V: Add the mini support for SiFive extensions.

2024-11-17 Thread shiyulong

From: yulong 

This patch adds minimal support for the xsfvqmaccqoq, xsfvqmaccdod and
xsfvfnrclipxfqf extensions.

gcc/ChangeLog:

* common/config/riscv/riscv-common.cc: New.
* config/riscv/riscv.opt: New.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/predef-sf-3.c: New test.
* gcc.target/riscv/predef-sf-4.c: New test.
* gcc.target/riscv/predef-sf-5.c: New test.

---
 gcc/common/config/riscv/riscv-common.cc  |  6 ++
 gcc/config/riscv/riscv.opt   |  6 ++
 gcc/testsuite/gcc.target/riscv/predef-sf-3.c | 14 ++
 gcc/testsuite/gcc.target/riscv/predef-sf-4.c | 14 ++
 gcc/testsuite/gcc.target/riscv/predef-sf-5.c | 14 ++
 5 files changed, 54 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/riscv/predef-sf-3.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/predef-sf-4.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/predef-sf-5.c

diff --git a/gcc/common/config/riscv/riscv-common.cc 
b/gcc/common/config/riscv/riscv-common.cc
index b0e49eb82c0..49e8a41846e 100644
--- a/gcc/common/config/riscv/riscv-common.cc
+++ b/gcc/common/config/riscv/riscv-common.cc
@@ -430,6 +430,9 @@ static const struct riscv_ext_version 
riscv_ext_version_table[] =
 
   {"xsfvcp",   ISA_SPEC_CLASS_NONE, 1, 0},
   {"xsfcease", ISA_SPEC_CLASS_NONE, 1, 0},
+  {"xsfvqmaccqoq",ISA_SPEC_CLASS_NONE, 1, 0},
+  {"xsfvqmaccdod",ISA_SPEC_CLASS_NONE, 1, 0},
+  {"xsfvfnrclipxfqf", ISA_SPEC_CLASS_NONE, 1, 0},
 
   /* Terminate the list.  */
   {NULL, ISA_SPEC_CLASS_NONE, 0, 0}
@@ -1759,6 +1762,9 @@ static const riscv_ext_flag_table_t 
riscv_ext_flag_table[] =
 
   RISCV_EXT_FLAG_ENTRY ("xsfvcp",   x_riscv_sifive_subext, MASK_XSFVCP),
   RISCV_EXT_FLAG_ENTRY ("xsfcease", x_riscv_sifive_subext, MASK_XSFCEASE),
+  RISCV_EXT_FLAG_ENTRY ("xsfvqmaccqoq",x_riscv_sifive_subext, 
MASK_XSFVQMACCQOQ),
+  RISCV_EXT_FLAG_ENTRY ("xsfvqmaccdod",x_riscv_sifive_subext, 
MASK_XSFVQMACCDOD),
+  RISCV_EXT_FLAG_ENTRY ("xsfvfnrclipxfqf", x_riscv_sifive_subext, 
MASK_XSFVFNRCLIPXFQF),
 
   {NULL, NULL, NULL, 0}
 };
diff --git a/gcc/config/riscv/riscv.opt b/gcc/config/riscv/riscv.opt
index ab9d6e82723..d7fa47f7080 100644
--- a/gcc/config/riscv/riscv.opt
+++ b/gcc/config/riscv/riscv.opt
@@ -523,6 +523,12 @@ Mask(XSFVCP) Var(riscv_sifive_subext)
 
 Mask(XSFCEASE) Var(riscv_sifive_subext)
 
+Mask(XSFVQMACCQOQ) Var(riscv_sifive_subext)
+
+Mask(XSFVQMACCDOD) Var(riscv_sifive_subext)
+
+Mask(XSFVFNRCLIPXFQF) Var(riscv_sifive_subext)
+
 TargetVariable
 int riscv_fmv_priority = 0
 
diff --git a/gcc/testsuite/gcc.target/riscv/predef-sf-3.c 
b/gcc/testsuite/gcc.target/riscv/predef-sf-3.c
new file mode 100644
index 000..0f3fbfd6907
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/predef-sf-3.c
@@ -0,0 +1,14 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64g_xsfvqmaccqoq -mabi=lp64" } */
+
+int main () {
+#if !defined(__riscv)
+#error "__riscv"
+#endif
+
+#if !defined(__riscv_xsfvqmaccqoq)
+#error "__riscv_xsfvqmaccqoq"
+#endif
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/riscv/predef-sf-4.c 
b/gcc/testsuite/gcc.target/riscv/predef-sf-4.c
new file mode 100644
index 000..9df0799313f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/predef-sf-4.c
@@ -0,0 +1,14 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64g_xsfvqmaccdod -mabi=lp64" } */
+
+int main () {
+#if !defined(__riscv)
+#error "__riscv"
+#endif
+
+#if !defined(__riscv_xsfvqmaccdod)
+#error "__riscv_xsfvqmaccdod"
+#endif
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/riscv/predef-sf-5.c 
b/gcc/testsuite/gcc.target/riscv/predef-sf-5.c
new file mode 100644
index 000..aeaf708f4e0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/predef-sf-5.c
@@ -0,0 +1,14 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64g_xsfvfnrclipxfqf -mabi=lp64" } */
+
+int main () {
+#if !defined(__riscv)
+#error "__riscv"
+#endif
+
+#if !defined(__riscv_xsfvfnrclipxfqf)
+#error "__riscv_xsfvfnrclipxfqf"
+#endif
+
+  return 0;
+}
-- 
2.34.1

[PATCH v2 11/14] Support for 64-bit location_t: RTL parts

2024-11-17 Thread Lewis Hyatt

Some RTL objects need to store a location_t. Currently, they store it in the
rt_int field of union rtunion, but in a world where location_t could be
64-bit, they need to store it in a larger variable. Unfortunately, rtunion
does not currently have a 64-bit int type for that purpose, so add one. In
order to avoid increasing any overhead when 64-bit locations are not in use,
the new field is dedicated for location_t storage only and has type
"location_t" so it will only be 64-bit if necessary. This necessitates
adding a new RTX format code 'L' for locations. There are very many switch
statements in the codebase that inspect the RTX format code. I took the
approach of finding all of them that handle code 'i' or 'n' and making sure
they handle 'L' too. I am sure that some of these call sites can never see
an 'L' code, but I thought it would be safer and more future-proof to handle
as many as possible, given it's just a line or two to add in most cases.

While testing this with --enable-checking=rtl, I came across one place in
final.cc that seems to be a (currently) harmless misuse of RTL:

set_cur_block_to_this_block:
  if (! this_block)
{
  if (INSN_LOCATION (insn) == UNKNOWN_LOCATION)
continue;
  else
this_block = DECL_INITIAL (cfun->decl);
}

In this part of reemit_insn_block_notes(), the insn variable could actually
be a NOTE and not an INSN. In that case, INSN_LOCATION() shouldn't be
called on it. It works fine currently because the field is properly accessed
by XINT() either way. (For an INSN, it is a location, but for a NOTE, it is
the note type enum). Currently, if insn is a NOTE, the comparison must
always be false because the note type is not equal to
0==UNKNOWN_LOCATION. Once locations and ints are differentiated, this line
leads to a checking failure, which I resolved by checking for the NOTE_P
case before calling INSN_LOCATION.

gcc/ChangeLog:

* rtl.def (DEBUG_INSN): Use new format code 'L' for location_t fields.
(INSN): Likewise.
(JUMP_INSN): Likewise.
(CALL_INSN): Likewise.
(ASM_INPUT): Likewise.
(ASM_OPERANDS): Likewise.
* rtl.h (union rtunion): Add new location_t RT_LOC member for use by
the 'L' format.
(struct rtx_debug_insn): Adjust comment.
(struct rtx_nonjump_insn): Adjust comment.
(struct rtx_call_insn): Adjust comment.
(XLOC): New accessor macro for rtunion::rt_loc.
(X0LOC): Likewise.
(XCLOC): Likewise.
(INSN_LOCATION): Use XLOC instead of XUINT to retrieve a location_t.
(NOTE_MARKER_LOCATION): Likewise for XCUINT -> XCLOC.
(ASM_OPERANDS_SOURCE_LOCATION): Likewise.
(ASM_INPUT_SOURCE_LOCATION): Likewise.
(gen_rtx_ASM_INPUT): Adjust to use sL format instead of si.
(gen_rtx_INSN): Adjust prototype to use location_t rather than int
for the location.
* cfgrtl.cc (force_nonfallthru_and_redirect): Change type of LOC
local variable from int to location_t.
* rtlhash.cc (add_rtx): Support 'L' format in the switch statement.
* var-tracking.cc (loc_cmp): Likewise.
* alias.cc (rtx_equal_for_memref_p): Likewise.
* config/alpha/alpha.cc (summarize_insn): Likewise.
* config/ia64/ia64.cc (rtx_needs_barrier): Likewise.
* config/rs6000/rs6000.cc (rs6000_hash_constant): Likewise.
* cse.cc (hash_rtx): Likewise.
(exp_equiv_p): Likewise.
* cselib.cc (rtx_equal_for_cselib_1): Likewise.
(cselib_hash_rtx): Likewise.
(cselib_expand_value_rtx_1): Likewise.
* emit-rtl.cc (copy_insn_1): Likewise.
(gen_rtx_INSN): Change the location argument from int to location_t,
and call the corresponding gen_rtx_fmt_* function.
* final.cc (reemit_insn_block_notes): Don't call INSN_LOCATION if
NOTE_P; the field being accessed is not a location in this case.
(leaf_renumber_regs_insn): Support 'L' format in the switch statement.
* genattrtab.cc (attr_rtx_1): Likewise.
* genemit.cc (gen_exp): Likewise.
* gengenrtl.cc (type_from_format): Likewise.
(accessor_from_format): Likewise.
* gengtype.cc (adjust_field_rtx_def): Likewise.
* genpeep.cc (match_rtx): Likewise; just mark gcc_unreachable() for
now.
* genrecog.cc (find_operand): Support 'L' format in the switch 
statement.
(find_matching_operand): Likewise.
(validate_pattern): Likewise.
* gensupport.cc (subst_pattern_match): Likewise.
(get_alternatives_number): Likewise.
(collect_insn_data): Likewise.
(alter_predicate_for_insn): Likewise.
(alter_constraints): Likewise.
(subst_dup): Likewise.
* jump.cc (rtx_renumbered_equal_p): Likewise.
* loop-invariant.cc (hash_invariant_expr_1): Likewise.
* lra-constraints.cc (operands_match_p): Likewise.
* lra.cc (l

[PATCH 2/4] RISC-V: Add interleave pattern.

2024-11-17 Thread Robin Dapp

From: Robin Dapp 

This patch adds efficient handling of interleaving patterns like
[0 4 1 5] to vec_perm_const.  It is implemented by a slideup and a
gather.

gcc/ChangeLog:

* config/riscv/riscv-v.cc (shuffle_interleave_patterns): New
function.
(expand_vec_perm_const_1): Use new function.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/autovec/vls-vlmax/shuffle-interleave-run.c: New 
test.
* gcc.target/riscv/rvv/autovec/vls-vlmax/shuffle-interleave.c: New test.
---
 gcc/config/riscv/riscv-v.cc   |  80 
 .../vls-vlmax/shuffle-interleave-run.c| 122 ++
 .../autovec/vls-vlmax/shuffle-interleave.c|  69 ++
 3 files changed, 271 insertions(+)
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/shuffle-interleave-run.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/shuffle-interleave.c

diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc
index b5c4a5037c0..d940b961bf8 100644
--- a/gcc/config/riscv/riscv-v.cc
+++ b/gcc/config/riscv/riscv-v.cc
@@ -3474,6 +3474,84 @@ shuffle_slide_patterns (struct expand_vec_perm_d *d)
   return true;
 }
 
+/* Recognize interleaving patterns like [0 4 1 5].  */
+
+static bool
+shuffle_interleave_patterns (struct expand_vec_perm_d *d)
+{
+  machine_mode vmode = d->vmode;
+  machine_mode sel_mode = related_int_vector_mode (vmode).require ();
+  poly_int64 vec_len = d->perm.length ();
+  int n_patterns = d->perm.encoding ().npatterns ();
+
+  if (!vec_len.is_constant ())
+return false;
+
+  if (n_patterns != 2)
+return false;
+
+  int vlen = vec_len.to_constant ();
+
+  if (vlen < 4 || vlen > 64)
+return false;
+
+  if (d->one_vector_p)
+return false;
+
+  bool low = true;
+  if (d->perm.series_p (0, 2, 0, 1)
+  && d->perm.series_p (1, 2, vlen, 1))
+low = true;
+  else if (d->perm.series_p (0, 2, vlen / 2, 1)
+  && d->perm.series_p (1, 2, vlen + vlen / 2, 1))
+low = false;
+  else
+return false;
+
+  vec_perm_builder sel (vlen, 2, 1);
+  sel.safe_grow (vlen);
+  int cnt = 0;
+  for (int i = 0; i < vlen; i += 2)
+{
+  sel[i] = cnt;
+  sel[i + 1] = cnt + vlen / 2;
+  cnt++;
+}
+
+  vec_perm_indices indices (sel, 2, vlen);
+
+  if (vlen != (int)indices.length ().to_constant ())
+return false;
+
+  /* Success!  */
+  if (d->testing_p)
+return true;
+
+  int slide_cnt = vlen / 2;
+  rtx tmp = gen_reg_rtx (vmode);
+
+  if (low)
+{
+  /* No need for a vector length because we slide up until the
+end of OP1 anyway.  */
+  rtx ops[] = {tmp, d->op0, d->op1, gen_int_mode (slide_cnt, Pmode)};
+  insn_code icode = code_for_pred_slide (UNSPEC_VSLIDEUP, vmode);
+  emit_vlmax_insn (icode, SLIDEUP_OP_MERGE, ops);
+}
+  else
+{
+  rtx ops[] = {tmp, d->op1, d->op0, gen_int_mode (slide_cnt, Pmode)};
+  insn_code icode = code_for_pred_slide (UNSPEC_VSLIDEDOWN, vmode);
+  emit_nonvlmax_insn (icode, BINARY_OP_TUMA, ops,
+ gen_int_mode (slide_cnt, Pmode));
+}
+
+  rtx sel_rtx = vec_perm_indices_to_rtx (sel_mode, indices);
+  emit_vlmax_gather_insn (gen_lowpart (vmode, d->target), tmp, sel_rtx);
+
+  return true;
+}
+
 /* Recognize decompress patterns:
 
1. VEC_PERM_EXPR op0 and op1
@@ -3790,6 +3868,8 @@ expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
return true;
  if (shuffle_slide_patterns (d))
return true;
+ if (shuffle_interleave_patterns (d))
+   return true;
  if (shuffle_compress_patterns (d))
return true;
  if (shuffle_decompress_patterns (d))
diff --git 
a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/shuffle-interleave-run.c 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/shuffle-interleave-run.c
new file mode 100644
index 000..57748d95362
--- /dev/null
+++ 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/shuffle-interleave-run.c
@@ -0,0 +1,122 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target riscv_v_ok } */
+/* { dg-add-options riscv_v } */
+/* { dg-additional-options "-O3 -mrvv-max-lmul=m8 -std=gnu99" } */
+
+#include "shuffle-interleave.c"
+
+#define SERIES_2(x, y) (x), (x + 1)
+#define SERIES_4(x, y) SERIES_2 (x, y), SERIES_2 (x + 2, y)
+#define SERIES_8(x, y) SERIES_4 (x, y), SERIES_4 (x + 4, y)
+#define SERIES_16(x, y) SERIES_8 (x, y), SERIES_8 (x + 8, y)
+#define SERIES_32(x, y) SERIES_16 (x, y), SERIES_16 (x + 16, y)
+#define SERIES_64(x, y) SERIES_32 (x, y), SERIES_32 (x + 32, y)
+
+#define comp(a, b, n)  
\
+  for (unsigned i = 0; i < n; ++i) 
\
+if ((a)[i] != (b)[i])  
\
+  __builtin_abort ();
+
+#define CHECK1(TYPE, NUNITS)   
\
+  __attribute_

[PATCH v2 09/14] Support for 64-bit location_t: C++ modules parts

2024-11-17 Thread Lewis Hyatt

Note: This patch was acked in v1
(https://gcc.gnu.org/pipermail/gcc-patches/2024-November/667608.html).
The only change from that version is that
#ifdef ENABLE_LARGE_SOURCE_LOCATIONS
has been changed to
if (sizeof (location_t) > sizeof (unsigned))
, because it was decided to remove the configure option. The rest of the
patch is the same as before. Thanks!

-Lewis

-- >8 --

The modules implementation is necessarily sensitive to the internal workings
of class line_map, and so it needed changes in order to handle a 64-bit
location_t. The changes mostly boil down to supporting that in the debug
dumping routines (which is accomplished by using a new custom code %K for
that purpose), and supporting that when streaming in and out from the
module (which is accomplished by using a new loc() function to go along with
existing abstractions like u() or z() for streaming in and out different
data types).

gcc/cp/ChangeLog:

* module.cc (bytes_out::loc): New function.
(bytes_in::loc): New function.
(struct span): Change int fields to location_diff_t.
(range_t): Change from "unsigned int" to "line_map_uint_t".
(struct ord_loc_info): Likewise.
(struct macro_loc_info): Likewise.
(class module_state): Likewise.
(dumper::operator()): Add new code 'K' for dumping a location_t.
(loc_spans::init): Use %K instead of %u for location_t dumps.
(loc_spans::open): Likewise.
(loc_spans::close): Likewise. Adjust bitwise expressions to support
64-bit location_t as well.
(struct module_state_config): Change ordinary_locs and macro_locs
from "unsigned int" to "line_map_uint_t".  Reorder fields to improve
packing.  Rather than changing the constructor initializer list to
match the new order, switch to NSDMI instead.
(module_state::note_location): Adjust to support 64-bit location_t.
(module_state::write_location): Use %K instead of %u for location_t
dumps. Use loc() instead of u() for streaming location_t.
(module_state::read_location): Likewise.
(module_state::write_ordinary_maps): Likewise.
(module_state::write_macro_maps): Likewise.
(module_state::write_config): Likewise.
(module_state::read_config): Likewise.
(module_state::write_prepare_maps): Use %K instead of %u for
location_t dumps. Adjust variable types and bitwise expressions to
support 64-bit location_t.
(module_state::read_ordinary_maps): Likewise.
(module_state::read_macro_maps): Likewise.
(preprocess_module): Adjust data types to support 64-bit number of
line maps.
---
 gcc/cp/module.cc | 227 ++-
 1 file changed, 128 insertions(+), 99 deletions(-)

diff --git a/gcc/cp/module.cc b/gcc/cp/module.cc
index fe9cdd9bc24..6bc83c6cba6 100644
--- a/gcc/cp/module.cc
+++ b/gcc/cp/module.cc
@@ -350,6 +350,9 @@ typedef hash_map 
ptr_int_hash_map;
 /* Variable length buffer.  */
 
 namespace {
+
+constexpr line_map_uint_t loc_one = 1;
+
 class data {
 public:
   class allocator {
@@ -549,6 +552,7 @@ public:
   int i ();/* Read a signed int.  */
   unsigned u ();   /* Read an unsigned int.  */
   size_t z (); /* Read a size_t.  */
+  location_t loc ();/* Read a location_t.  */
   HOST_WIDE_INT wi ();  /* Read a HOST_WIDE_INT.  */
   unsigned HOST_WIDE_INT wu (); /* Read an unsigned HOST_WIDE_INT.  */
   const char *str (size_t * = NULL); /* Read a string.  */
@@ -633,6 +637,7 @@ public:
   void i (int);/* Write signed int.  */
   void u (unsigned);   /* Write unsigned int.  */
   void z (size_t s);   /* Write size_t.  */
+  void loc (location_t); /* Write location_t.  */
   void wi (HOST_WIDE_INT); /* Write HOST_WIDE_INT.  */
   void wu (unsigned HOST_WIDE_INT);  /* Write unsigned HOST_WIDE_INT.  */
   void str (const char *ptr)
@@ -1057,6 +1062,24 @@ bytes_in::z ()
 return wu ();
 }
 
+/* location_t written as 32- or 64-bit as needed.  */
+
+inline void bytes_out::loc (location_t l)
+{
+  if (sizeof (location_t) > sizeof (unsigned))
+wu (l);
+  else
+u (l);
+}
+
+inline location_t bytes_in::loc ()
+{
+  if (sizeof (location_t) > sizeof (unsigned))
+return wu ();
+  else
+return u ();
+}
+
 /* Buffer simply memcpied.  */
 void *
 bytes_out::buf (size_t len)
@@ -3210,7 +3233,7 @@ trees_out::~trees_out ()
 
 
 /* I use half-open [first,second) ranges.  */
-typedef std::pair range_t;
+typedef std::pair range_t;
 
 /* A range of locations.  */
 typedef std::pair loc_range_t;
@@ -3227,8 +3250,9 @@ public:
   struct span {
 loc_range_t ordinary;  /* Ordinary map location range. */
 loc_range_t macro; /* Macro map location range.  */
-int ordinary_delta;/* Add to ordinary loc to get serialized loc.  
*/
-int macro_delta;   /* Likewise for macro loc.  */
+/* Add to locs to get serialized loc.  */
+

[PATCH 2/2] Flatten anonymous structs in CodeView types

2024-11-17 Thread Mark Harmstone

If a CodeView struct, class, or union has as a member an anonymous
struct, class, or union, this gets flattened. The sub-struct's members
will appear as if they were part of their parent.

For this, we move part of get_type_num_struct into a new function
add_to_fieldlist, which also handles creating an LF_INDEX overflow item
if an LF_FIELDLIST grows too large. This is because add_struct_member
now calls itself recursively, and so needs to handle overflows itself.

gcc/
* dwarf2codeview.cc (add_to_fieldlist): New function.
(add_struct_member): Call recursively to flatten structs, and call
add_to_fieldlist.
(add_struct_static_member): Call add_to_fieldlist.
(add_struct_function): Call add_to_fieldlist.
(add_struct_inheritance): Call add_to_fieldlist.
(add_struct_nested_type): Call add_to_fieldlist.
(get_type_num_struct): Move code to add_to_fieldlist, and move
responsibility for this to subfunctions.
---
 gcc/dwarf2codeview.cc | 280 +-
 1 file changed, 167 insertions(+), 113 deletions(-)

diff --git a/gcc/dwarf2codeview.cc b/gcc/dwarf2codeview.cc
index 261fcea6a97..59380335991 100644
--- a/gcc/dwarf2codeview.cc
+++ b/gcc/dwarf2codeview.cc
@@ -5901,6 +5901,52 @@ add_struct_forward_def (dw_die_ref type)
   return ct->num;
 }
 
+/* Add a new subtype to an LF_FIELDLIST type, and handle overflows if
+   necessary.  */
+
+static void
+add_to_fieldlist (codeview_custom_type **ct, uint16_t *num_members,
+ codeview_subtype *el, size_t el_len)
+{
+  /* Add an LF_INDEX subtype if everything's too big for one
+ LF_FIELDLIST.  */
+
+  if ((*ct)->lf_fieldlist.length + el_len > MAX_FIELDLIST_SIZE)
+{
+  codeview_subtype *idx;
+  codeview_custom_type *ct2;
+
+  idx = (codeview_subtype *) xmalloc (sizeof (*idx));
+  idx->next = NULL;
+  idx->kind = LF_INDEX;
+  idx->lf_index.type_num = 0;
+
+  (*ct)->lf_fieldlist.last_subtype->next = idx;
+  (*ct)->lf_fieldlist.last_subtype = idx;
+
+  ct2 = (codeview_custom_type *)
+   xmalloc (sizeof (codeview_custom_type));
+
+  ct2->next = *ct;
+  ct2->kind = LF_FIELDLIST;
+  ct2->lf_fieldlist.length = 0;
+  ct2->lf_fieldlist.subtypes = NULL;
+  ct2->lf_fieldlist.last_subtype = NULL;
+
+  *ct = ct2;
+}
+
+  (*ct)->lf_fieldlist.length += el_len;
+
+  if ((*ct)->lf_fieldlist.last_subtype)
+(*ct)->lf_fieldlist.last_subtype->next = el;
+  else
+(*ct)->lf_fieldlist.subtypes = el;
+
+  (*ct)->lf_fieldlist.last_subtype = el;
+  (*num_members)++;
+}
+
 /* Add an LF_BITFIELD type, returning its number.  DWARF represents bitfields
as members in a struct with a DW_AT_data_bit_offset attribute, whereas in
CodeView they're a distinct type.  */
@@ -5933,36 +5979,69 @@ create_bitfield (dw_die_ref c)
 
 static void
 add_struct_member (dw_die_ref c, uint16_t accessibility,
-  codeview_subtype **el, size_t *el_len)
+  codeview_custom_type **ct, uint16_t *num_members,
+  unsigned int base_offset)
 {
-  *el = (codeview_subtype *) xmalloc (sizeof (**el));
-  (*el)->next = NULL;
-  (*el)->kind = LF_MEMBER;
-  (*el)->lf_member.attributes = accessibility;
+  codeview_subtype *el;
+  size_t el_len;
+  dw_die_ref type = get_AT_ref (c, DW_AT_type);
+  unsigned int offset;
+
+  offset = base_offset + get_AT_unsigned (c, DW_AT_data_member_location);
+
+  /* If the data member is actually an anonymous struct, class, or union,
+ follow MSVC by flattening this into its parent.  */
+  if (!get_AT_string (c, DW_AT_name) && type
+  && (dw_get_die_tag (type) == DW_TAG_structure_type
+  || dw_get_die_tag (type) == DW_TAG_class_type
+  || dw_get_die_tag (type) == DW_TAG_union_type))
+{
+  dw_die_ref c2, first_child;
+
+  first_child = dw_get_die_child (type);
+  c2 = first_child;
+
+  do
+   {
+ c2 = dw_get_die_sib (c2);
+
+ if (dw_get_die_tag (c2) == DW_TAG_member)
+ add_struct_member (c2, accessibility, ct, num_members, offset);
+   }
+  while (c2 != first_child);
+
+  return;
+}
+
+  el = (codeview_subtype *) xmalloc (sizeof (*el));
+  el->next = NULL;
+  el->kind = LF_MEMBER;
+  el->lf_member.attributes = accessibility;
 
   if (get_AT (c, DW_AT_data_bit_offset))
-(*el)->lf_member.type = create_bitfield (c);
+el->lf_member.type = create_bitfield (c);
   else
-(*el)->lf_member.type = get_type_num (get_AT_ref (c, DW_AT_type),
- true, false);
+el->lf_member.type = get_type_num (type, true, false);
 
-  (*el)->lf_member.offset.neg = false;
-  (*el)->lf_member.offset.num = get_AT_unsigned (c, 
DW_AT_data_member_location);
+  el->lf_member.offset.neg = false;
+  el->lf_member.offset.num = offset;
 
-  *el_len = 11 + cv_integer_len (&(*el)->lf_member.offset);
+  el_len = 11 + cv_integer_len (&el->lf_member.offset);
 
   if (get

[PATCH 1/2] Produce CodeView info about nested types

2024-11-17 Thread Mark Harmstone

If the DIE for a struct, class, or union contains a nested type, add a
LF_NESTTYPE entry to its field list recording this.

Plus if we use a nested type, make sure that its parent also gets
defined. This may entail adding a forward definition and creating a
deferred type, so we need to call flush_deferred_types in
codeview_debug_finish as well.

gcc/
* dwarf2codeview.cc (enum cv_leaf_type): Add LF_NESTTYPE.
(struct codeview_subtype): Add lf_nesttype to union.
(flush_deferred_types): Add declaration.
(write_lf_fieldlist): Handle LF_NESTTYPE.
(codeview_debug_finish): Call flush_deferred_types.
(add_struct_nested_type): New function.
(get_type_num_struct): Call add_struct_nested_type, and if nested make
sure that parent is added.
---
(This doesn't logically depend on my pending S_INLINESITE patches, but
does if you are attempting to apply this cleanly. I'm getting this out
before the code freeze.)

 gcc/dwarf2codeview.cc | 89 ++-
 1 file changed, 88 insertions(+), 1 deletion(-)

diff --git a/gcc/dwarf2codeview.cc b/gcc/dwarf2codeview.cc
index 08fbe7f5bb6..261fcea6a97 100644
--- a/gcc/dwarf2codeview.cc
+++ b/gcc/dwarf2codeview.cc
@@ -116,6 +116,7 @@ enum cv_leaf_type {
   LF_MEMBER = 0x150d,
   LF_STMEMBER = 0x150e,
   LF_METHOD = 0x150f,
+  LF_NESTTYPE = 0x1510,
   LF_ONEMETHOD = 0x1511,
   LF_FUNC_ID = 0x1601,
   LF_MFUNC_ID = 0x1602,
@@ -1285,6 +1286,11 @@ struct codeview_subtype
   uint32_t base_class_type;
   codeview_integer offset;
 } lf_bclass;
+struct
+{
+  uint32_t type;
+  char *name;
+} lf_nesttype;
   };
 };
 
@@ -1497,6 +1503,7 @@ static uint32_t get_type_num_subroutine_type (dw_die_ref 
type, bool in_struct,
 static void write_cv_padding (size_t padding);
 static uint32_t get_func_id (dw_die_ref die);
 static void write_inlinesite_records (dw_die_ref func, dw_die_ref die);
+static void flush_deferred_types (void);
 
 /* Return the file ID corresponding to a given source filename.  */
 
@@ -4364,6 +4371,40 @@ write_lf_fieldlist (codeview_custom_type *t)
  write_cv_padding (4 - (leaf_len % 4));
  break;
 
+   case LF_NESTTYPE:
+ /* This is lf_nest_type in binutils and lfNestType in Microsoft's
+cvinfo.h:
+
+   struct lf_nest_type
+   {
+ uint16_t kind;
+ uint16_t padding;
+ uint32_t type;
+ char name[];
+   } ATTRIBUTE_PACKED;
+ */
+
+ fputs (integer_asm_op (2, false), asm_out_file);
+ fprint_whex (asm_out_file, LF_NESTTYPE);
+ putc ('\n', asm_out_file);
+
+ fputs (integer_asm_op (2, false), asm_out_file);
+ fprint_whex (asm_out_file, 0);
+ putc ('\n', asm_out_file);
+
+ fputs (integer_asm_op (4, false), asm_out_file);
+ fprint_whex (asm_out_file, v->lf_nesttype.type);
+ putc ('\n', asm_out_file);
+
+ name_len = strlen (v->lf_nesttype.name) + 1;
+ ASM_OUTPUT_ASCII (asm_out_file, v->lf_nesttype.name, name_len);
+
+ leaf_len = 8 + name_len;
+ write_cv_padding (4 - (leaf_len % 4));
+
+ free (v->lf_nesttype.name);
+ break;
+
default:
  break;
}
@@ -5137,6 +5178,12 @@ codeview_debug_finish (void)
 
   write_codeview_symbols ();
 
+  /* If we reference a nested struct but not its parent, add_deferred_type
+ gets called if we create a forward reference for this, even though we've
+ already flushed this in codeview_debug_early_finish.  In this case we will
+ need to flush this list again.  */
+  flush_deferred_types ();
+
   if (custom_types)
 write_custom_types ();
 
@@ -6135,6 +6182,32 @@ is_templated_func (dw_die_ref die)
   return false;
 }
 
+/* Create a field list subtype that records that a struct has a nested type
+   contained within it.  */
+
+static void
+add_struct_nested_type (dw_die_ref c, codeview_subtype **el, size_t *el_len)
+{
+  const char *name = get_AT_string (c, DW_AT_name);
+  size_t name_len;
+
+  if (!name)
+return;
+
+  name_len = strlen (name);
+
+  *el = (codeview_subtype *) xmalloc (sizeof (**el));
+  (*el)->next = NULL;
+  (*el)->kind = LF_NESTTYPE;
+  (*el)->lf_nesttype.type = get_type_num (c, true, false);
+  (*el)->lf_nesttype.name = xstrdup (name);
+
+  *el_len = 9 + name_len;
+
+  if (*el_len % 4)
+*el_len += 4 - (*el_len % 4);
+}
+
 /* Process a DW_TAG_structure_type, DW_TAG_class_type, or DW_TAG_union_type
DIE, add an LF_FIELDLIST and an LF_STRUCTURE / LF_CLASS / LF_UNION type,
and return the number of the latter.  */
@@ -6142,11 +6215,18 @@ is_templated_func (dw_die_ref die)
 static uint32_t
 get_type_num_struct (dw_die_ref type, bool in_struct, bool *is_fwd_ref)
 {
-  dw_die_ref first_child;
+  dw_die_ref parent, first_child;
   codeview_custom_type *ct;
   uint16_t num_members = 0;
   uint32_t last_type = 0;
 
+  parent = dw_get_die_parent(type);

Re: [patch,avr] PR84211: Add a new post reload optimization pass

2024-11-17 Thread Denis Chertykov

вс, 17 нояб. 2024 г. в 18:42, Georg-Johann Lay :
>
> This introduces a new post reload pass that tracks known values held
> in registers and performs optimizations based on that knowledge.
>
> It runs between the two instances of the RTL peephole pass.
[...]
> (Memento, AbsInt, AbsInsByte, AbsIntVal)
> (OptimizaData, InsnOptimizeData, FindPliesData)
> (InsnInfo, BBInfo, Ply, Plies): New structs / classes.

IMHO: these names don't conform to the GNU coding standards.

Denis.

[committed] hppa: Remove typedef for bool type

2024-11-17 Thread John David Anglin

Tested on hppa-unknown-linux-gnu.  Committed to trunk.

Dave
---

hppa: Remove typedef for bool type

In C23, bool is now a keyword.  So, doing a typedef for it is invalid.

2024-11-17  John David Anglin  

libgcc/ChangeLog:

PR target/117627
* config/pa/linux-atomic.c: Remove typedef for bool type.

diff --git a/libgcc/config/pa/linux-atomic.c b/libgcc/config/pa/linux-atomic.c
index 03ebccfc070..6191f83ed1c 100644
--- a/libgcc/config/pa/linux-atomic.c
+++ b/libgcc/config/pa/linux-atomic.c
@@ -264,8 +264,6 @@ OP_AND_FETCH_WORD (and,   , &)
 OP_AND_FETCH_WORD (xor,   , ^)
 OP_AND_FETCH_WORD (nand, ~, &)
 
-typedef unsigned char bool;
-
 #define COMPARE_AND_SWAP_2(TYPE, WIDTH, INDEX) \
   TYPE HIDDEN  \
   __sync_val_compare_and_swap_##WIDTH (volatile void *ptr, TYPE oldval,
\


signature.asc
Description: PGP signature

[PATCH] testsuite: Fix pr101145inf*.c testcases [PR117494]

2024-11-17 Thread Andrew Pinski

Instead of doing a dg-do run with a specific target check for linux,
use signal as the effective-target, since this requires the use
of the ALARM signal to do the testing.
Also use check_vect in main and rename main to main1 to make sure
we don't use the registers.

Tested on x86_64-linux-gnu.

PR testsuite/117494
gcc/testsuite/ChangeLog:

* gcc.dg/vect/pr101145inf.c: Remove dg-do and replace
with dg-require-effective-target of signal.
* gcc.dg/vect/pr101145inf_1.c: Likewise.
* gcc.dg/vect/pr101145inf.inc: Rename main to main1
and mark as noinline.
Include tree-vect.h. Have main call check_vect and main1.

Signed-off-by: Andrew Pinski 
---
 gcc/testsuite/gcc.dg/vect/pr101145inf.c   | 2 +-
 gcc/testsuite/gcc.dg/vect/pr101145inf.inc | 9 -
 gcc/testsuite/gcc.dg/vect/pr101145inf_1.c | 2 +-
 3 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/gcc/testsuite/gcc.dg/vect/pr101145inf.c 
b/gcc/testsuite/gcc.dg/vect/pr101145inf.c
index 3ad8c1a2dd7..aa598875aa5 100644
--- a/gcc/testsuite/gcc.dg/vect/pr101145inf.c
+++ b/gcc/testsuite/gcc.dg/vect/pr101145inf.c
@@ -1,4 +1,4 @@
-/* { dg-do run { target *-*-linux* *-*-gnu* *-*-uclinux* } } */
+/* { dg-require-effective-target signal } */
 /* { dg-additional-options "-O3" } */
 #include 
 #include "pr101145inf.inc"
diff --git a/gcc/testsuite/gcc.dg/vect/pr101145inf.inc 
b/gcc/testsuite/gcc.dg/vect/pr101145inf.inc
index 4aa3d049187..eb855b9881a 100644
--- a/gcc/testsuite/gcc.dg/vect/pr101145inf.inc
+++ b/gcc/testsuite/gcc.dg/vect/pr101145inf.inc
@@ -1,6 +1,7 @@
 #include 
 #include 
 #include 
+#include "tree-vect.h"
 
 void test_finite ();
 void test_infinite ();
@@ -10,7 +11,8 @@ void do_exit (int i)
   exit (0);
 }
 
-int main(void)
+__attribute__((noinline))
+int main1(void)
 {
   test_finite ();
   struct sigaction s;
@@ -26,3 +28,8 @@ int main(void)
   return 1;
 }
 
+int main(void)
+{
+  check_vect ();
+  return main1();
+}
diff --git a/gcc/testsuite/gcc.dg/vect/pr101145inf_1.c 
b/gcc/testsuite/gcc.dg/vect/pr101145inf_1.c
index e3e9dd46d10..0465788c3cc 100644
--- a/gcc/testsuite/gcc.dg/vect/pr101145inf_1.c
+++ b/gcc/testsuite/gcc.dg/vect/pr101145inf_1.c
@@ -1,4 +1,4 @@
-/* { dg-do run { target *-*-linux* *-*-gnu* *-*-uclinux* } } */
+/* { dg-require-effective-target signal } */
 /* { dg-additional-options "-O3" } */
 #include 
 #include "pr101145inf.inc"
-- 
2.43.0

PING: [PATCH v3 1/7] Improve outgoing integer argument promotion

2024-11-17 Thread H.J. Lu

On Sun, Nov 10, 2024 at 8:55 PM H.J. Lu  wrote:
>
> For targets, like x86, which define TARGET_PROMOTE_PROTOTYPES to return
> true, all integer arguments smaller than int are passed as int:
>
> [hjl@gnu-tgl-3 pr14907]$ cat x.c
> extern int baz (char c1);
>
> int
> foo (char c1)
> {
>   return baz (c1);
> }
> [hjl@gnu-tgl-3 pr14907]$ gcc -S -O2 -m32 x.c
> [hjl@gnu-tgl-3 pr14907]$ cat x.s
> .file   "x.c"
> .text
> .p2align 4
> .globl  foo
> .type   foo, @function
> foo:
> .LFB0:
> .cfi_startproc
> movsbl  4(%esp), %eax
> movl%eax, 4(%esp)
> jmp baz
> .cfi_endproc
> .LFE0:
> .size   foo, .-foo
> .ident  "GCC: (GNU) 14.2.1 20240912 (Red Hat 14.2.1-3)"
> .section.note.GNU-stack,"",@progbits
> [hjl@gnu-tgl-3 pr14907]$
>
> But integer promotion:
>
> movsbl  4(%esp), %eax
> movl%eax, 4(%esp)
>
> isn't necessary if incoming arguments and outgoing arguments are the
> same.  Drop targetm.promote_prototypes from C, C++ and Ada frontends
> and apply targetm.promote_prototypes during RTL call expansion.

PING.

> gcc/
>
> PR middle-end/14907
> * calls.cc: Include "ssa.h", "tree-ssa-live.h" and
> "tree-outof-ssa.h".
> (get_promoted_int_value_from_ssa_name): New function.
> (get_promoted_int_value): Likewise.
> (initialize_argument_information): Call get_promoted_int_value
> to promote integer function argument.
> * gimple.cc (gimple_builtin_call_types_compatible_p): Remove the
> targetm.calls.promote_prototypes call.
> * tree.cc (tree_builtin_call_types_compatible_p): Likewise.
>
> gcc/ada/
>
> PR middle-end/14907
> * gcc-interface/utils.cc (create_param_decl): Remove the
> targetm.calls.promote_prototypes call.
>
> gcc/c/
>
> PR middle-end/14907
> * c-decl.cc (start_decl): Remove the
> targetm.calls.promote_prototypes call.
> (store_parm_decls_oldstyle): Likewise.
> (finish_function): Likewise.
> * c-typeck.cc (convert_argument): Likewise.
> (c_safe_arg_type_equiv_p): Likewise.
>
> gcc/cp/
>
> PR middle-end/14907
> * call.cc (type_passed_as): Remove the
> targetm.calls.promote_prototypes call.
> (convert_for_arg_passing): Likewise.
> * typeck.cc (cxx_safe_arg_type_equiv_p): Likewise.
>
> gcc/testsuite/
>
> PR middle-end/14907
> * gcc.target/i386/pr14907-1.c: New test.
> * gcc.target/i386/pr14907-2.c: Likewise.
> * gcc.target/i386/pr14907-3.c: Likewise.
> * gcc.target/i386/pr14907-4.c: Likewise.
> * gcc.target/i386/pr14907-5.c: Likewise.
> * gcc.target/i386/pr14907-6.c: Likewise.
> * gcc.target/i386/pr14907-7.c: Likewise.
> * gcc.target/i386/pr14907-8.c: Likewise.
> * gcc.target/i386/pr14907-9.c: Likewise.
> * gcc.target/i386/pr14907-10.c: Likewise.
> * gcc.target/i386/pr14907-11.c: Likewise.
> * gcc.target/i386/pr14907-12.c: Likewise.
> * gcc.target/i386/pr14907-13.c: Likewise.
> * gcc.target/i386/pr14907-14.c: Likewise.
> * gcc.target/i386/pr14907-15.c: Likewise.
> * gcc.target/i386/pr14907-16.c: Likewise.
> * gfortran.dg/pr14907-1.f90: Likewise.
>
> Signed-off-by: H.J. Lu 
> ---
>  gcc/ada/gcc-interface/utils.cc | 24 ---
>  gcc/c/c-decl.cc| 40 ---
>  gcc/c/c-typeck.cc  | 19 ++---
>  gcc/calls.cc   | 81 ++
>  gcc/cp/call.cc | 10 ---
>  gcc/cp/typeck.cc   | 13 ++--
>  gcc/gimple.cc  | 10 +--
>  gcc/testsuite/gcc.target/i386/pr14907-1.c  | 21 ++
>  gcc/testsuite/gcc.target/i386/pr14907-10.c | 23 ++
>  gcc/testsuite/gcc.target/i386/pr14907-11.c | 12 
>  gcc/testsuite/gcc.target/i386/pr14907-12.c | 17 +
>  gcc/testsuite/gcc.target/i386/pr14907-13.c | 12 
>  gcc/testsuite/gcc.target/i386/pr14907-14.c | 17 +
>  gcc/testsuite/gcc.target/i386/pr14907-15.c | 26 +++
>  gcc/testsuite/gcc.target/i386/pr14907-16.c | 24 +++
>  gcc/testsuite/gcc.target/i386/pr14907-2.c  | 21 ++
>  gcc/testsuite/gcc.target/i386/pr14907-3.c  | 21 ++
>  gcc/testsuite/gcc.target/i386/pr14907-4.c  | 21 ++
>  gcc/testsuite/gcc.target/i386/pr14907-5.c  | 21 ++
>  gcc/testsuite/gcc.target/i386/pr14907-6.c  | 21 ++
>  gcc/testsuite/gcc.target/i386/pr14907-7.c  | 22 ++
>  gcc/testsuite/gcc.target/i386/pr14907-8.c  | 23 ++
>  gcc/testsuite/gcc.target/i386/pr14907-9.c  | 22 ++
>  gcc/testsuite/gfortran.dg/pr14907-1.f90| 17 +
>  gcc/tree.cc| 14 
>  25 files changed, 431 insertions(+), 121 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr14907-1.c
>  create mode 100

[PATCH] Fortran: add bounds-checking for ALLOCATE of CHARACTER with type-spec [PR53357]

2024-11-17 Thread Harald Anlauf

Dear all,

the attached patch fixes a rejects-valid / rejects-potentially-valid code issue
for ALLOCATE of CHARACTER with type-spec, and adds character length checking
with -fcheck=bounds for the case at hand.  It also improves checking of
character function declarations and references slightly, using the diagnostics
of NAG as guidance.

Some testcases popped up during regtesting, suggesting that one needs to be
careful not to generate too many false positives, so I decided to not spend
too much time on the FIXMEs therein.  (Character length might be expressions
in an explicit interface and the actual declaration, where we don't have a
reliable way to compare.)

Regtested on x86_64-pc-linux-gnu.  OK for mainline?

Thanks,
Harald

From d09473af7e25c81bad95ff6c66c89e2d184147e6 Mon Sep 17 00:00:00 2001
From: Harald Anlauf 
Date: Sun, 17 Nov 2024 23:04:58 +0100
Subject: [PATCH] Fortran: add bounds-checking for ALLOCATE of CHARACTER with
 type-spec [PR53357]

Fix a rejects-(potentially)-valid code for ALLOCATE of CHARACTER with
type-spec, and implement a string-length check for -fcheck=bounds.
Implement more detailed errors or warnings when character function
declarations and references do not match.

	PR fortran/53357

gcc/fortran/ChangeLog:

	* dependency.cc (gfc_dep_compare_expr): Return correct result if
	relationship of expressions could not be determined.
	* interface.cc (gfc_check_result_characteristics): Implement error
	messages if character function declarations and references do not
	agree, else emit warning in cases where a mismatch is suspected.
	* trans-stmt.cc (gfc_trans_allocate): Implement a string length
	check for -fcheck=bounds.

gcc/testsuite/ChangeLog:

	* gfortran.dg/auto_char_len_4.f90: Adjust patterns.
	* gfortran.dg/typebound_override_1.f90: Likewise.
	* gfortran.dg/bounds_check_strlen_10.f90: New test.
---
 gcc/fortran/dependency.cc |  2 +-
 gcc/fortran/interface.cc  | 27 ---
 gcc/fortran/trans-stmt.cc | 11 
 gcc/testsuite/gfortran.dg/auto_char_len_4.f90 | 25 -
 .../gfortran.dg/bounds_check_strlen_10.f90| 21 +++
 .../gfortran.dg/typebound_override_1.f90  |  4 +--
 6 files changed, 77 insertions(+), 13 deletions(-)
 create mode 100644 gcc/testsuite/gfortran.dg/bounds_check_strlen_10.f90

diff --git a/gcc/fortran/dependency.cc b/gcc/fortran/dependency.cc
index 2d3db9541bb..1fd65bbadca 100644
--- a/gcc/fortran/dependency.cc
+++ b/gcc/fortran/dependency.cc
@@ -474,7 +474,7 @@ gfc_dep_compare_expr (gfc_expr *e1, gfc_expr *e2)
   }

   if (e1->expr_type != e2->expr_type)
-return -3;
+return -2;

   switch (e1->expr_type)
 {
diff --git a/gcc/fortran/interface.cc b/gcc/fortran/interface.cc
index 61c506bfdb5..176c7d4a8ed 100644
--- a/gcc/fortran/interface.cc
+++ b/gcc/fortran/interface.cc
@@ -1692,9 +1692,30 @@ gfc_check_result_characteristics (gfc_symbol *s1, gfc_symbol *s2,
 	  return false;

 	case -2:
-	  /* FIXME: Implement a warning for this case.
-	  snprintf (errmsg, err_len, "Possible character length mismatch "
-			"in function result");*/
+	  if (r1->ts.u.cl->length->expr_type == EXPR_CONSTANT)
+		{
+		  snprintf (errmsg, err_len,
+			"Function declared with a non-constant character "
+			"length referenced with a constant length");
+		  return false;
+		}
+	  else if (r2->ts.u.cl->length->expr_type == EXPR_CONSTANT)
+		{
+		  snprintf (errmsg, err_len,
+			"Function declared with a constant character "
+			"length referenced with a non-constant length");
+		  return false;
+		}
+	  /* Warn if length expression types are different, except for
+		  possibly false positives where complex expressions might have
+		  been used.  */
+	  else if ((r1->ts.u.cl->length->expr_type
+			!= r2->ts.u.cl->length->expr_type)
+		   && (r1->ts.u.cl->length->expr_type != EXPR_OP
+			   || r2->ts.u.cl->length->expr_type != EXPR_OP))
+		gfc_warning (0, "Possible character length mismatch in "
+			 "function result between %L and %L",
+			 &r1->declared_at, &r2->declared_at);
 	  break;

 	case 0:
diff --git a/gcc/fortran/trans-stmt.cc b/gcc/fortran/trans-stmt.cc
index 520ab505659..a409c25b899 100644
--- a/gcc/fortran/trans-stmt.cc
+++ b/gcc/fortran/trans-stmt.cc
@@ -6393,6 +6393,7 @@ gfc_trans_allocate (gfc_code * code, gfc_omp_namelist *omp_allocate)
   gfc_symtree *newsym = NULL;
   symbol_attribute caf_attr;
   gfc_actual_arglist *param_list;
+  tree ts_string_length = NULL_TREE;

   if (!code->ext.alloc.list)
 return NULL_TREE;
@@ -6741,6 +6742,7 @@ gfc_trans_allocate (gfc_code * code, gfc_omp_namelist *omp_allocate)
 	  gfc_init_se (&se_sz, NULL);
 	  gfc_conv_expr (&se_sz, sz);
 	  gfc_free_expr (sz);
+	  ts_string_length = fold_convert (gfc_charlen_type_node, se_sz.expr);
 	  tmp = gfc_get_char_type (code->ext.alloc.ts.kind);
 	  tmp = TYPE_SIZE_UNIT (tmp);
 	  tmp = fold_convert (TREE_TYPE (se_

Re: [PATCH v2] RISC-V: Improve vsetvl vconfig alignment

2024-11-17 Thread Jeff Law





On 11/15/24 3:25 AM, Robin Dapp wrote:

So this is really the biggest question in my mind. When we kicked this
around in the patchwork meeting several weeks ago I got the impression
Robin had a correctness concern with this code.  Robin, do you remember
what had you worried?


Unfortunately I haven't managed to get back to this in weeks now.
I vaguely recall that it was something related to "changed" as well but
it looks like you have that covered already.  I'd say let's go ahead with
that fixed.

I poked around a bit more at this.

It appears to me that at least part of the problem is we're trying to 
merge vsetvl information when prev_info and curr_info are the same 
object.  At least that's the way it looked to me with some light 
debugging.   Naturally this leads to an infinite loop continually making 
no real changes to the same node.


Second, my initial impression was that we should do this regardless of 
whether or not the avls were compatible.  But after reviewing the 
codegen changes that result from such a change, I'm less sure.  I didn't 
see any cases where codegen got better, but I did see a few where it was 
clearly worse.


I think Dusan really needs to chime in here or this can't reasonably go 
forward.  I'm going to mark it as deferred in patchwork until we hear 
from Dusan.


jeff

Re: [PATCH 2/3] Node clones share order.

2024-11-17 Thread Jan Hubicka

> Symbol order corresponds to the order in source code.
> For clones their order is currently arbitrarily chosen as max order++
> But it would be more consistent with original purpose to choose clones
> order to be shared with the original node order.
> This stabilizes clone order for Incremental LTO.
> 
> Order is thus no longer unique, but this property is not used outside
> of previous patch, where we can use uid.
> If total order would be needed, sorting by order and then uid suffices.
> 
> gcc/ChangeLog:
> 
>   * cgraph.h (symbol_table::register_symbol):
> Order can be already set.
>   * cgraphclones.cc (cgraph_node::create_clone):
> Reuse order for clones.
OK, thanks!
Honza

Re: [PATCH 3/3] incremental lto: Remap node order for stability.

2024-11-17 Thread Jan Hubicka

> This patch adds remapping of node order for each lto partition.
> Resulting order conserves relative order inside partition, but
> is independent of outside symbols. So if lto partition contains
> identical set of symbols, their remapped order will be stable
> between compilations.
> 
> gcc/ChangeLog:
> 
>   * ipa-devirt.cc (ipa_odr_summary_write):
>   Add unused argument.
>   * ipa-fnsummary.cc (ipa_fn_summary_write): Likewise.
>   * ipa-icf.cc (sem_item_optimizer::write_summary): Likewise.
>   * ipa-modref.cc (modref_write): Likewise.
>   * ipa-prop.cc (ipa_prop_write_jump_functions): Likewise.
>   (ipcp_write_transformation_summaries): Likewise.
>   * ipa-sra.cc (ipa_sra_write_summary): Likewise.
>   * lto-cgraph.cc (lto_symtab_encoder_delete): Delete remap.
>   (lto_output_node): Remap order.
>   (lto_output_varpool_node): Likewise.
>   (output_cgraph_opt_summary): Add unused argument.
>   * lto-streamer-out.cc (produce_asm): Use remapped order.
>   (output_function): Propagate remapped order.
>   (output_constructor): Likewise.
>   (copy_function_or_variable): Likewise.
>   (cmp_int): New.
>   (create_order_remap): New.
>   (lto_output): Create remap. Remap order.
>   * lto-streamer.h (struct lto_symtab_encoder_d): Remap hash_map.
>   (produce_asm): Add order argument.

> ---
>  gcc/ipa-devirt.cc   |  2 +-
>  gcc/ipa-fnsummary.cc|  2 +-
>  gcc/ipa-icf.cc  |  2 +-
>  gcc/ipa-modref.cc   |  4 +-
>  gcc/ipa-prop.cc |  4 +-
>  gcc/ipa-sra.cc  |  2 +-
>  gcc/lto-cgraph.cc   | 10 +++--
>  gcc/lto-streamer-out.cc | 84 +++--
>  gcc/lto-streamer.h  |  5 ++-
>  9 files changed, 91 insertions(+), 24 deletions(-)
> 
> diff --git a/gcc/ipa-devirt.cc b/gcc/ipa-devirt.cc
> index c406e5138db..098798281b7 100644
> --- a/gcc/ipa-devirt.cc
> +++ b/gcc/ipa-devirt.cc
> @@ -4131,7 +4131,7 @@ ipa_odr_summary_write (void)
>odr_enum_map = NULL;
>  }
>  
> -  produce_asm (ob, NULL);
> +  produce_asm (ob, NULL, -1);
Arguments of produce_asm seem somewhat magical.
> +/* Compare ints, callback for qsort.  */
> +
> +static int
> +cmp_int (const void *a, const void *b)
> +{
> +  int ia = *(int const*) a;
> +  int ib = *(int const*) b;
> +  return ia - ib;
> +}
Given that it is no longer C source, perhaps std::sort would allow doing
this without an extra comparator.  But I am fine with the function as it is :)
> +extern void produce_asm (struct output_block *ob, tree fn, int output_order);

I would suggest renaming produce_asm to produce_symbol_asm 
and making produce_asm a wrapper which passes fn=NULL and output_order=-1,
so we do not have odd parameters everywhere in streaming code.

OK with this change.
Honza

Re: [PATCH RFC] libcpp, libstdc++: add __has_import builtin

2024-11-17 Thread Jason Merrill


On 11/15/24 6:30 PM, Ville Voutilainen wrote:

On Sat, 16 Nov 2024 at 01:12, Jason Merrill  wrote:


Does this seem like an interesting direction?

-- 8< --

A problem with coexistence of module std and the library headers is that
import and then #include tends to break (PR99000).  But even with that
fixed, it might be useful to be able to test whether a module has been
imported.  So, this patch implements __has_import, along the same lines as
__has_builtin and such.

This does not test whether an import is available, which seems too variable
of a property; rather, it tests whether we've already seen an import in this
TU.


Should it be named  __seen_import? All those __has-things seem to be 
feature-tests,
and this isn't. Or to put it another way, the __has-things query properties of 
the implementation,
this builtin is for querying what has been encountered in the TU so far. That 
sounds like
apples and oranges, right?


Yeah, you're probably right; __has_include comes before the #include, 
this comes after the import, so using __has for both could be misleading.


Jason

[pushed v2] c-family: add -fsearch-include-path

2024-11-17 Thread Jason Merrill

Tested x86_64-pc-linux-gnu, applying to trunk.

-- 8< --

The C++ modules code has a -fmodule-header (or -x c++-{user,system}-header)
option to specify looking up headers to compile to header units on the usual
include paths.  I'd like to have the same functionality for full C++20
modules such as module std, which I proposed to live on the include path at
bits/std.cc.  But this behavior doesn't seem necessarily connected to
modules, so I'm proposing a general C/C++ option to specify the behavior of
looking in the include path for the input files specified on the command
line.

Other ideas for the name of the option are very welcome.

The libcpp change is to allow -fsearch-include-path{,=user} to find files in
the current working directory, like -include.  This can be handy for a quick
compile of both std.cc and a file that imports it, e.g.

g++ -std=c++20 -fmodules -fsearch-include-path bits/std.cc importer.cc

gcc/ChangeLog:

* doc/cppopts.texi: Document -fsearch-include-path.
* doc/invoke.texi: Mention it for modules.

gcc/c-family/ChangeLog:

* c.opt: Add -fsearch-include-path.
* c-opts.cc (c_common_post_options): Handle it.

gcc/cp/ChangeLog:

* module.cc (module_preprocess_options): Don't override it.

libcpp/ChangeLog:

* internal.h (search_path_head): Declare.
* files.cc (search_path_head): No longer static.
* init.cc (cpp_read_main_file): Use it.
---
 gcc/doc/cppopts.texi   | 15 +++
 gcc/doc/invoke.texi|  5 +
 gcc/c-family/c.opt |  7 +++
 libcpp/internal.h  |  2 ++
 gcc/c-family/c-opts.cc | 13 +
 gcc/cp/module.cc   |  3 ++-
 libcpp/files.cc|  5 +
 libcpp/init.cc | 12 +---
 8 files changed, 50 insertions(+), 12 deletions(-)

diff --git a/gcc/doc/cppopts.texi b/gcc/doc/cppopts.texi
index 5b5b0848ae8..748db5ea579 100644
--- a/gcc/doc/cppopts.texi
+++ b/gcc/doc/cppopts.texi
@@ -270,6 +270,21 @@ When preprocessing, do not shorten system header paths 
with canonicalization.
 @item -fmax-include-depth=@var{depth}
 Set the maximum depth of the nested #include. The default is 200. 
 
+@opindex fsearch-include-path
+@item -fsearch-include-path@r{[}=@var{kind}@r{]}
+Look for input files on the #include path, not just the current
+directory.  This is particularly useful with C++20 modules, for which
+both header units and module interface units need to be compiled
+directly:
+
+@smallexample
+g++ -c -std=c++20 -fmodules -fsearch-include-path bits/stdc++.h bits/std.cc
+@end smallexample
+
+@var{kind} defaults to @samp{user}, which looks on the @code{#include
+"@dots{}"} search path; you can also explicitly specify @samp{system}
+for the @code{#include <@dots{}>} search path.
+
 @opindex ftabstop
 @item -ftabstop=@var{width}
 Set the distance between tab stops.  This helps the preprocessor report
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 929feaf23fb..8aeccb1953a 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -38001,6 +38001,11 @@ installed.  Specifying the language as one of these 
variants also
 inhibits output of the object file, as header files have no associated
 object file.
 
+Alternately, or for a module interface unit in an installed location,
+you can use @option{-fsearch-include-path} to specify that the main
+source file should be found on the include path rather than the
+current directory.
+
 Header units can be used in much the same way as precompiled headers
 (@pxref{Precompiled Headers}), but with fewer restrictions: an
 #include that is translated to a header unit import can appear at any
diff --git a/gcc/c-family/c.opt b/gcc/c-family/c.opt
index 61cfe33c251..8224c82bfdf 100644
--- a/gcc/c-family/c.opt
+++ b/gcc/c-family/c.opt
@@ -2280,6 +2280,13 @@ frtti
 C++ ObjC++ Optimization Var(flag_rtti) Init(1)
 Generate run time type descriptor information.
 
+fsearch-include-path
+C ObjC C++ ObjC++
+Look for the main source file on the include path.
+
+fsearch-include-path=
+C++ ObjC++ Joined RejectNegative Undocumented
+
 fshort-enums
 C ObjC C++ ObjC++ LTO Optimization Var(flag_short_enums)
 Use the narrowest integer type possible for enumeration types.
diff --git a/libcpp/internal.h b/libcpp/internal.h
index e65198e89da..d91acd64ba3 100644
--- a/libcpp/internal.h
+++ b/libcpp/internal.h
@@ -766,6 +766,8 @@ extern _cpp_file *_cpp_find_file (cpp_reader *, const char 
*, cpp_dir *,
  int angle, _cpp_find_file_kind, location_t);
 extern bool _cpp_find_failed (_cpp_file *);
 extern void _cpp_mark_file_once_only (cpp_reader *, struct _cpp_file *);
+extern cpp_dir *search_path_head (cpp_reader *, const char *, int,
+ include_type, bool = false);
 extern const char *_cpp_find_header_unit (cpp_reader *, const char *file,
  bool angle_p,  location_t);
 extern int _cpp_stack_embed (cpp_reader *, const char *, bool,
diff --git a/gcc/c-fam

Re: [r15-5336 Regression] FAIL: gcc.dg/guality/pr36728-4.c -Os -DPREVENT_OPTIMIZATION line 16 y == 2 on Linux/x86_64

2024-11-17 Thread Andrew Pinski

On Sat, Nov 16, 2024 at 9:27 AM haochen.jiang  wrote:
>
> On Linux/x86_64,
>
> cee7d080d5c2a5fb8125878998b742c040ec88b4 is the first bad commit
> commit cee7d080d5c2a5fb8125878998b742c040ec88b4
> Author: Jan Hubicka 
> Date:   Sat Nov 16 14:04:32 2024 +0100
>
> Ignore conditions guarding __builtin_unreachable in inliner metrics
>
> caused
>
> FAIL: gcc.dg/guality/pr36728-2.c   -O2  -DPREVENT_OPTIMIZATION  line 16 arg1 
> == 1
> FAIL: gcc.dg/guality/pr36728-2.c   -O2  -DPREVENT_OPTIMIZATION  line 16 arg2 
> == 2
> FAIL: gcc.dg/guality/pr36728-2.c   -O2  -DPREVENT_OPTIMIZATION  line 16 arg3 
> == 3
> FAIL: gcc.dg/guality/pr36728-2.c   -O2  -DPREVENT_OPTIMIZATION  line 16 arg4 
> == 4
> FAIL: gcc.dg/guality/pr36728-2.c   -O2  -DPREVENT_OPTIMIZATION  line 16 arg5 
> == 5
> FAIL: gcc.dg/guality/pr36728-2.c   -O2  -DPREVENT_OPTIMIZATION  line 16 arg6 
> == 6
> FAIL: gcc.dg/guality/pr36728-2.c   -O2  -DPREVENT_OPTIMIZATION  line 16 arg7 
> == 30
> FAIL: gcc.dg/guality/pr36728-2.c   -O2  -DPREVENT_OPTIMIZATION  line 18 arg1 
> == 1
> FAIL: gcc.dg/guality/pr36728-2.c   -O2  -DPREVENT_OPTIMIZATION  line 18 arg2 
> == 2
> FAIL: gcc.dg/guality/pr36728-2.c   -O2  -DPREVENT_OPTIMIZATION  line 18 arg3 
> == 3
> FAIL: gcc.dg/guality/pr36728-2.c   -O2  -DPREVENT_OPTIMIZATION  line 18 arg4 
> == 4
> FAIL: gcc.dg/guality/pr36728-2.c   -O2  -DPREVENT_OPTIMIZATION  line 18 arg5 
> == 5
> FAIL: gcc.dg/guality/pr36728-2.c   -O2  -DPREVENT_OPTIMIZATION  line 18 arg6 
> == 6
> FAIL: gcc.dg/guality/pr36728-2.c   -O2  -DPREVENT_OPTIMIZATION  line 18 arg7 
> == 30
> FAIL: gcc.dg/guality/pr36728-2.c   -O2 -flto -fno-use-linker-plugin 
> -flto-partition=none  -DPREVENT_OPTIMIZATION line 16 arg1 == 1
> FAIL: gcc.dg/guality/pr36728-2.c   -O2 -flto -fno-use-linker-plugin 
> -flto-partition=none  -DPREVENT_OPTIMIZATION line 16 arg2 == 2
> FAIL: gcc.dg/guality/pr36728-2.c   -O2 -flto -fno-use-linker-plugin 
> -flto-partition=none  -DPREVENT_OPTIMIZATION line 16 arg3 == 3
> FAIL: gcc.dg/guality/pr36728-2.c   -O2 -flto -fno-use-linker-plugin 
> -flto-partition=none  -DPREVENT_OPTIMIZATION line 16 arg4 == 4
> FAIL: gcc.dg/guality/pr36728-2.c   -O2 -flto -fno-use-linker-plugin 
> -flto-partition=none  -DPREVENT_OPTIMIZATION line 16 arg5 == 5
> FAIL: gcc.dg/guality/pr36728-2.c   -O2 -flto -fno-use-linker-plugin 
> -flto-partition=none  -DPREVENT_OPTIMIZATION line 16 arg6 == 6
> FAIL: gcc.dg/guality/pr36728-2.c   -O2 -flto -fno-use-linker-plugin 
> -flto-partition=none  -DPREVENT_OPTIMIZATION line 16 arg7 == 30
> FAIL: gcc.dg/guality/pr36728-2.c   -O2 -flto -fno-use-linker-plugin 
> -flto-partition=none  -DPREVENT_OPTIMIZATION line 18 arg1 == 1
> FAIL: gcc.dg/guality/pr36728-2.c   -O2 -flto -fno-use-linker-plugin 
> -flto-partition=none  -DPREVENT_OPTIMIZATION line 18 arg2 == 2
> FAIL: gcc.dg/guality/pr36728-2.c   -O2 -flto -fno-use-linker-plugin 
> -flto-partition=none  -DPREVENT_OPTIMIZATION line 18 arg3 == 3
> FAIL: gcc.dg/guality/pr36728-2.c   -O2 -flto -fno-use-linker-plugin 
> -flto-partition=none  -DPREVENT_OPTIMIZATION line 18 arg4 == 4
> FAIL: gcc.dg/guality/pr36728-2.c   -O2 -flto -fno-use-linker-plugin 
> -flto-partition=none  -DPREVENT_OPTIMIZATION line 18 arg5 == 5
> FAIL: gcc.dg/guality/pr36728-2.c   -O2 -flto -fno-use-linker-plugin 
> -flto-partition=none  -DPREVENT_OPTIMIZATION line 18 arg6 == 6
> FAIL: gcc.dg/guality/pr36728-2.c   -O2 -flto -fno-use-linker-plugin 
> -flto-partition=none  -DPREVENT_OPTIMIZATION line 18 arg7 == 30
> FAIL: gcc.dg/guality/pr36728-2.c   -O2 -flto -fuse-linker-plugin 
> -fno-fat-lto-objects  -DPREVENT_OPTIMIZATION line 16 arg1 == 1
> FAIL: gcc.dg/guality/pr36728-2.c   -O2 -flto -fuse-linker-plugin 
> -fno-fat-lto-objects  -DPREVENT_OPTIMIZATION line 16 arg2 == 2
> FAIL: gcc.dg/guality/pr36728-2.c   -O2 -flto -fuse-linker-plugin 
> -fno-fat-lto-objects  -DPREVENT_OPTIMIZATION line 16 arg3 == 3
> FAIL: gcc.dg/guality/pr36728-2.c   -O2 -flto -fuse-linker-plugin 
> -fno-fat-lto-objects  -DPREVENT_OPTIMIZATION line 16 arg4 == 4
> FAIL: gcc.dg/guality/pr36728-2.c   -O2 -flto -fuse-linker-plugin 
> -fno-fat-lto-objects  -DPREVENT_OPTIMIZATION line 16 arg5 == 5
> FAIL: gcc.dg/guality/pr36728-2.c   -O2 -flto -fuse-linker-plugin 
> -fno-fat-lto-objects  -DPREVENT_OPTIMIZATION line 16 arg6 == 6
> FAIL: gcc.dg/guality/pr36728-2.c   -O2 -flto -fuse-linker-plugin 
> -fno-fat-lto-objects  -DPREVENT_OPTIMIZATION line 16 arg7 == 30
> FAIL: gcc.dg/guality/pr36728-2.c   -O2 -flto -fuse-linker-plugin 
> -fno-fat-lto-objects  -DPREVENT_OPTIMIZATION line 18 arg1 == 1
> FAIL: gcc.dg/guality/pr36728-2.c   -O2 -flto -fuse-linker-plugin 
> -fno-fat-lto-objects  -DPREVENT_OPTIMIZATION line 18 arg2 == 2
> FAIL: gcc.dg/guality/pr36728-2.c   -O2 -flto -fuse-linker-plugin 
> -fno-fat-lto-objects  -DPREVENT_OPTIMIZATION line 18 arg3 == 3
> FAIL: gcc.dg/guality/pr36728-2.c   -O2 -flto -fuse-linker-plugin 
> -fno-fat-lto-objects  -DPREVENT_OPTIMIZATION line 18 arg4 == 4
> FAIL: gcc.dg/guality/pr36728-2.c   -O2 -flto -fuse-linker-pl

Re: [r15-5336 Regression] FAIL: gcc.dg/guality/pr36728-4.c -Os -DPREVENT_OPTIMIZATION line 16 y == 2 on Linux/x86_64

2024-11-17 Thread Richard Biener




> Am 17.11.2024 um 09:54 schrieb Andrew Pinski :
> 
> On Sat, Nov 16, 2024 at 9:27 AM haochen.jiang  
> wrote:
>> 
>> On Linux/x86_64,
>> 
>> cee7d080d5c2a5fb8125878998b742c040ec88b4 is the first bad commit
>> commit cee7d080d5c2a5fb8125878998b742c040ec88b4
>> Author: Jan Hubicka 
>> Date:   Sat Nov 16 14:04:32 2024 +0100
>> 
>>Ignore conditions guarding __builtin_unreachable in inliner metrics
>> 
>> caused
>> 
>> FAIL: gcc.dg/guality/pr36728-2.c   -O2  -DPREVENT_OPTIMIZATION  line 16 arg1 
>> == 1
>> FAIL: gcc.dg/guality/pr36728-2.c   -O2  -DPREVENT_OPTIMIZATION  line 16 arg2 
>> == 2
>> FAIL: gcc.dg/guality/pr36728-2.c   -O2  -DPREVENT_OPTIMIZATION  line 16 arg3 
>> == 3
>> FAIL: gcc.dg/guality/pr36728-2.c   -O2  -DPREVENT_OPTIMIZATION  line 16 arg4 
>> == 4
>> FAIL: gcc.dg/guality/pr36728-2.c   -O2  -DPREVENT_OPTIMIZATION  line 16 arg5 
>> == 5
>> FAIL: gcc.dg/guality/pr36728-2.c   -O2  -DPREVENT_OPTIMIZATION  line 16 arg6 
>> == 6
>> FAIL: gcc.dg/guality/pr36728-2.c   -O2  -DPREVENT_OPTIMIZATION  line 16 arg7 
>> == 30
>> FAIL: gcc.dg/guality/pr36728-2.c   -O2  -DPREVENT_OPTIMIZATION  line 18 arg1 
>> == 1
>> FAIL: gcc.dg/guality/pr36728-2.c   -O2  -DPREVENT_OPTIMIZATION  line 18 arg2 
>> == 2
>> FAIL: gcc.dg/guality/pr36728-2.c   -O2  -DPREVENT_OPTIMIZATION  line 18 arg3 
>> == 3
>> FAIL: gcc.dg/guality/pr36728-2.c   -O2  -DPREVENT_OPTIMIZATION  line 18 arg4 
>> == 4
>> FAIL: gcc.dg/guality/pr36728-2.c   -O2  -DPREVENT_OPTIMIZATION  line 18 arg5 
>> == 5
>> FAIL: gcc.dg/guality/pr36728-2.c   -O2  -DPREVENT_OPTIMIZATION  line 18 arg6 
>> == 6
>> FAIL: gcc.dg/guality/pr36728-2.c   -O2  -DPREVENT_OPTIMIZATION  line 18 arg7 
>> == 30
>> FAIL: gcc.dg/guality/pr36728-2.c   -O2 -flto -fno-use-linker-plugin 
>> -flto-partition=none  -DPREVENT_OPTIMIZATION line 16 arg1 == 1
>> FAIL: gcc.dg/guality/pr36728-2.c   -O2 -flto -fno-use-linker-plugin 
>> -flto-partition=none  -DPREVENT_OPTIMIZATION line 16 arg2 == 2
>> FAIL: gcc.dg/guality/pr36728-2.c   -O2 -flto -fno-use-linker-plugin 
>> -flto-partition=none  -DPREVENT_OPTIMIZATION line 16 arg3 == 3
>> FAIL: gcc.dg/guality/pr36728-2.c   -O2 -flto -fno-use-linker-plugin 
>> -flto-partition=none  -DPREVENT_OPTIMIZATION line 16 arg4 == 4
>> FAIL: gcc.dg/guality/pr36728-2.c   -O2 -flto -fno-use-linker-plugin 
>> -flto-partition=none  -DPREVENT_OPTIMIZATION line 16 arg5 == 5
>> FAIL: gcc.dg/guality/pr36728-2.c   -O2 -flto -fno-use-linker-plugin 
>> -flto-partition=none  -DPREVENT_OPTIMIZATION line 16 arg6 == 6
>> FAIL: gcc.dg/guality/pr36728-2.c   -O2 -flto -fno-use-linker-plugin 
>> -flto-partition=none  -DPREVENT_OPTIMIZATION line 16 arg7 == 30
>> FAIL: gcc.dg/guality/pr36728-2.c   -O2 -flto -fno-use-linker-plugin 
>> -flto-partition=none  -DPREVENT_OPTIMIZATION line 18 arg1 == 1
>> FAIL: gcc.dg/guality/pr36728-2.c   -O2 -flto -fno-use-linker-plugin 
>> -flto-partition=none  -DPREVENT_OPTIMIZATION line 18 arg2 == 2
>> FAIL: gcc.dg/guality/pr36728-2.c   -O2 -flto -fno-use-linker-plugin 
>> -flto-partition=none  -DPREVENT_OPTIMIZATION line 18 arg3 == 3
>> FAIL: gcc.dg/guality/pr36728-2.c   -O2 -flto -fno-use-linker-plugin 
>> -flto-partition=none  -DPREVENT_OPTIMIZATION line 18 arg4 == 4
>> FAIL: gcc.dg/guality/pr36728-2.c   -O2 -flto -fno-use-linker-plugin 
>> -flto-partition=none  -DPREVENT_OPTIMIZATION line 18 arg5 == 5
>> FAIL: gcc.dg/guality/pr36728-2.c   -O2 -flto -fno-use-linker-plugin 
>> -flto-partition=none  -DPREVENT_OPTIMIZATION line 18 arg6 == 6
>> FAIL: gcc.dg/guality/pr36728-2.c   -O2 -flto -fno-use-linker-plugin 
>> -flto-partition=none  -DPREVENT_OPTIMIZATION line 18 arg7 == 30
>> FAIL: gcc.dg/guality/pr36728-2.c   -O2 -flto -fuse-linker-plugin 
>> -fno-fat-lto-objects  -DPREVENT_OPTIMIZATION line 16 arg1 == 1
>> FAIL: gcc.dg/guality/pr36728-2.c   -O2 -flto -fuse-linker-plugin 
>> -fno-fat-lto-objects  -DPREVENT_OPTIMIZATION line 16 arg2 == 2
>> FAIL: gcc.dg/guality/pr36728-2.c   -O2 -flto -fuse-linker-plugin 
>> -fno-fat-lto-objects  -DPREVENT_OPTIMIZATION line 16 arg3 == 3
>> FAIL: gcc.dg/guality/pr36728-2.c   -O2 -flto -fuse-linker-plugin 
>> -fno-fat-lto-objects  -DPREVENT_OPTIMIZATION line 16 arg4 == 4
>> FAIL: gcc.dg/guality/pr36728-2.c   -O2 -flto -fuse-linker-plugin 
>> -fno-fat-lto-objects  -DPREVENT_OPTIMIZATION line 16 arg5 == 5
>> FAIL: gcc.dg/guality/pr36728-2.c   -O2 -flto -fuse-linker-plugin 
>> -fno-fat-lto-objects  -DPREVENT_OPTIMIZATION line 16 arg6 == 6
>> FAIL: gcc.dg/guality/pr36728-2.c   -O2 -flto -fuse-linker-plugin 
>> -fno-fat-lto-objects  -DPREVENT_OPTIMIZATION line 16 arg7 == 30
>> FAIL: gcc.dg/guality/pr36728-2.c   -O2 -flto -fuse-linker-plugin 
>> -fno-fat-lto-objects  -DPREVENT_OPTIMIZATION line 18 arg1 == 1
>> FAIL: gcc.dg/guality/pr36728-2.c   -O2 -flto -fuse-linker-plugin 
>> -fno-fat-lto-objects  -DPREVENT_OPTIMIZATION line 18 arg2 == 2
>> FAIL: gcc.dg/guality/pr36728-2.c   -O2 -flto -fuse-linker-plugin 
>> -fno-fat-lto-objects  -DPREVENT_OPTIMIZATION line 18 arg3 == 3
>> FAIL: gcc.dg/guality/pr36728-2.c   -O2

PATCH for Re: new mirror greece

2024-11-17 Thread Gerald Pfeifer

On Wed, 28 Aug 2024, Konstantinos Draziotis wrote:
> The issue I was experiencing seems to be resolved now. Please add it to the
> mirrors.
 Location : Thessaloniki / Greece
 Admin Name  : K. A. Draziotis
 Admin Email : drazi...@gmail.com
 Sponsor Name: Aristotle University of Thessaloniki
 Sponsor URL : https://auth.gr
 HTTP/HTTPS URL: fosszone.csd.auth.gr/gnu/gcc

Thank you, Konstantinos.

Here is the update I made to our mirrors page.

Gerald

commit 1b5645903bfac2945f2edf40f0d3e2aa2d3861a5
Author: Gerald Pfeifer 
Date:   Sun Nov 17 18:51:00 2024 +0100

mirrors: Add auth.gr

diff --git a/htdocs/mirrors.html b/htdocs/mirrors.html
index e9d1090a..309a3729 100644
--- a/htdocs/mirrors.html
+++ b/htdocs/mirrors.html
@@ -24,6 +24,7 @@ mirrors.  The following sites mirror the gcc.gnu.org 
download site
 Germany: https://ftp.mpi-inf.mpg.de/mirrors/gnu/mirror/gcc.gnu.org/pub/gcc/";>mpi-sb.mpg.de,
 thanks to ftpad...@mpi-sb.mpg.de
 Germany: http://gcc.cybermirror.org";>http://gcc.cybermirror.org, thanks to 
Sascha Schwarz (c...@cybermirror.org)
 Greece: http://ftp.ntua.gr/mirror/gnu/gcc/";>ntua.gr, thanks 
to ftp...@ntua.gr
+Greece: https://fosszone.csd.auth.gr/gnu/gcc/";>auth.gr 
(Aristotle University of Thessaloniki), thanks to K. A. Draziotis 

 Hungary, Budapest: http://robotlab.itk.ppke.hu/gcc/";>robotlab.itk.ppke.hu, thanks to 
Adam Rak (neur...@gmail.com)
 Japan: http://ftp.tsukuba.wide.ad.jp/software/gcc/";>ftp.tsukuba.wide.ad.jp, 
thanks to Kohei Takahashi (tsukuba-ftp-serv...@tsukuba.wide.ad.jp)
 Morocco:

Re: [r15-5359 Regression] FAIL: g++.dg/tree-ssa/pr109442.C -std=gnu++26 scan-tree-dump-not optimized "_M_start" on Linux/x86_64

2024-11-17 Thread Jan Hubicka

> On Linux/x86_64,
> 
> cc33f880e553d1aa94d19a349ad755f34c33de9e is the first bad commit
> commit cc33f880e553d1aa94d19a349ad755f34c33de9e
> Author: Jan Hubicka 
> Date:   Sat Nov 16 23:45:57 2024 +0100
> 
> Avoid expicit builtion list in tree-ssa-dce
> 
> caused
> 
> FAIL: g++.dg/tree-ssa/pr109442.C  -std=gnu++11  scan-tree-dump-not optimized 
> "_M_start"
> FAIL: g++.dg/tree-ssa/pr109442.C  -std=gnu++17  scan-tree-dump-not optimized 
> "_M_start"
> FAIL: g++.dg/tree-ssa/pr109442.C  -std=gnu++26  scan-tree-dump-not optimized 
> "_M_start"

Here I accidentally bundled the testcase with the wrong patch; it is fixed now
by the libstdc++ change in rev aac5c57ee167230cea466064951daf06e42197b9

Honza

[PATCH] RISC-V: Add slide to perm_const strategies.

2024-11-17 Thread Robin Dapp

From: Robin Dapp 

This patch adds a shuffle_slide_patterns to expand_vec_perm_const.
It recognizes permutations like

  {0, 1, 4, 5}
or
  {2, 3, 6, 7}

which can be constructed by a slideup or slidedown of one of the vectors
into the other one.

gcc/ChangeLog:

* config/riscv/riscv-v.cc (shuffle_slide_patterns): New.
(expand_vec_perm_const_1): Call new function.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/autovec/vls-vlmax/shuffle-slide-run.c: New test.
* gcc.target/riscv/rvv/autovec/vls-vlmax/shuffle-slide.c: New test.

RISC-V: Add interleave pattern.

This patch adds efficient handling of interleaving patterns like
[0 4 1 5] to vec_perm_const.  It is implemented by a slideup and a
gather.

gcc/ChangeLog:

* config/riscv/riscv-v.cc (shuffle_interleave_patterns): New
function.
(expand_vec_perm_const_1): Use new function.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/autovec/vls-vlmax/shuffle-interleave-run.c: New 
test.
* gcc.target/riscv/rvv/autovec/vls-vlmax/shuffle-interleave.c: New test.

RISC-V: Add even/odd vec_perm_const pattern.

This adds handling for even/odd patterns.

gcc/ChangeLog:

* config/riscv/riscv-v.cc (shuffle_evenodd_patterns): New
function.
(expand_vec_perm_const_1): Use new function.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/autovec/vls-vlmax/shuffle-evenodd-run.c: New 
test.
* gcc.target/riscv/rvv/autovec/vls-vlmax/shuffle-evenodd.c: New test.

RISC-V: Improve slide1up pattern.

This patch adds a second variant to implement the extract/slide1up
pattern.  In order to do a permutation like
<3, 4, 5, 6> from vectors <0, 1, 2, 3> and <4, 5, 6, 7>
we currently extract <3> from the first vector and re-insert it into the
second vector.  Unless register-file crossing latency is essentially
zero it should be preferable to first slide the second vector up by
one, then slide down the first vector by (nunits - 1).

gcc/ChangeLog:

* config/riscv/riscv-protos.h (riscv_register_move_cost):
Export.
* config/riscv/riscv-v.cc (shuffle_extract_and_slide1up_patterns):
Add slideup/slidedown variant.
* config/riscv/riscv.cc (riscv_secondary_memory_needed): Remove
static.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/autovec/pr112599-2.c: Adjust test
expectation.

varasm: Use PRECISION instead of SIZE [PR123456].

When optimizing the constant pool we currently don't handle
vector mask modes properly.  While emitting two compress
patterns {1, 0, 1, 0} (V4BI) and {1, 0, 1, 0, 1, 0, 1, 0} (V8BI)
I noticed that they would end up with the same constant pool entry
of "10" (={1, 0, 1, 0}).
This is because we compare MODE_SIZE elements instead of MODE_PRECISION
so the hash of both constants would be identical.

This patch uses GET_MODE_PRECISION instead and also fixes one similar
instance in simplify-rtx.

gcc/ChangeLog:

* simplify-rtx.cc (native_encode_rtx): Use GET_MODE_PRECISION.
* varasm.cc (optimize_constant_pool): Ditto.
---
 gcc/config/riscv/riscv-protos.h   |   1 +
 gcc/config/riscv/riscv-v.cc   | 287 +-
 gcc/config/riscv/riscv.cc |  18 +-
 gcc/simplify-rtx.cc   |   2 +-
 .../gcc.target/riscv/rvv/autovec/pr112599-2.c |   2 +-
 .../autovec/vls-vlmax/shuffle-evenodd-run.c   | 122 
 .../rvv/autovec/vls-vlmax/shuffle-evenodd.c   |  68 +
 .../vls-vlmax/shuffle-interleave-run.c| 122 
 .../autovec/vls-vlmax/shuffle-interleave.c|  69 +
 .../rvv/autovec/vls-vlmax/shuffle-slide-run.c | 266 
 .../rvv/autovec/vls-vlmax/shuffle-slide.c | 207 +
 gcc/varasm.cc |   2 +-
 12 files changed, 1152 insertions(+), 14 deletions(-)
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/shuffle-evenodd-run.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/shuffle-evenodd.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/shuffle-interleave-run.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/shuffle-interleave.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/shuffle-slide-run.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/shuffle-slide.c

diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
index 500b357f6eb..ecb4e64cdf8 100644
--- a/gcc/config/riscv/riscv-protos.h
+++ b/gcc/config/riscv/riscv-protos.h
@@ -139,6 +139,7 @@ extern void riscv_expand_ussub (rtx, rtx, rtx);
 extern void riscv_expand_sssub (rtx, rtx, rtx);
 extern void riscv_expand_ustrunc (rtx, rtx);
 extern void riscv_expand_sstrunc (rtx, rtx);
+extern int riscv_register_move_cost (machine_mode, reg_class_t, reg_class_t);
 
 #ifdef RTX_CODE
 extern void riscv_expand_int_scc (rtx, enum rtx_code,

[PATCH 3/4] RISC-V: Add even/odd vec_perm_const pattern.

2024-11-17 Thread Robin Dapp

From: Robin Dapp 

This adds handling for even/odd patterns.

gcc/ChangeLog:

* config/riscv/riscv-v.cc (shuffle_evenodd_patterns): New
function.
(expand_vec_perm_const_1): Use new function.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/autovec/vls-vlmax/shuffle-evenodd-run.c: New 
test.
* gcc.target/riscv/rvv/autovec/vls-vlmax/shuffle-evenodd.c: New test.
---
 gcc/config/riscv/riscv-v.cc   |  66 ++
 .../autovec/vls-vlmax/shuffle-evenodd-run.c   | 122 ++
 .../rvv/autovec/vls-vlmax/shuffle-evenodd.c   |  68 ++
 3 files changed, 256 insertions(+)
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/shuffle-evenodd-run.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/shuffle-evenodd.c

diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc
index d940b961bf8..4fb032af953 100644
--- a/gcc/config/riscv/riscv-v.cc
+++ b/gcc/config/riscv/riscv-v.cc
@@ -3552,6 +3552,70 @@ shuffle_interleave_patterns (struct expand_vec_perm_d *d)
   return true;
 }
 
+
+/* Recognize even/odd patterns like [0 2 4 6].  We use two compress
+   and one slideup.  */
+
+static bool
+shuffle_evenodd_patterns (struct expand_vec_perm_d *d)
+{
+  machine_mode vmode = d->vmode;
+  poly_int64 vec_len = d->perm.length ();
+  int n_patterns = d->perm.encoding ().npatterns ();
+
+  if (n_patterns != 1)
+return false;
+
+  if (!vec_len.is_constant ())
+return false;
+
+  int vlen = vec_len.to_constant ();
+  if (vlen < 4 || vlen > 64)
+return false;
+
+  if (d->one_vector_p)
+return false;
+
+  bool even = true;
+  if (!d->perm.series_p (0, 1, 0, 2))
+{
+  even = false;
+  if (!d->perm.series_p (0, 1, 1, 2))
+   return false;
+}
+
+  /* Success!  */
+  if (d->testing_p)
+return true;
+
+  machine_mode mask_mode = get_mask_mode (vmode);
+  rvv_builder builder (mask_mode, vlen, 1);
+  int bit = even ? 0 : 1;
+  for (int i = 0; i < vlen; i++)
+{
+  bit ^= 1;
+  if (bit)
+   builder.quick_push (CONST1_RTX (BImode));
+  else
+   builder.quick_push (CONST0_RTX (BImode));
+}
+  rtx mask = force_reg (mask_mode, builder.build ());
+
+  insn_code icode = code_for_pred_compress (vmode);
+  rtx ops1[] = {d->target, d->op0, mask};
+  emit_vlmax_insn (icode, COMPRESS_OP, ops1);
+
+  rtx tmp2 = gen_reg_rtx (vmode);
+  rtx ops2[] = {tmp2, d->op1, mask};
+  emit_vlmax_insn (icode, COMPRESS_OP, ops2);
+
+  rtx ops[] = {d->target, d->target, tmp2, gen_int_mode (vlen / 2, Pmode)};
+  icode = code_for_pred_slide (UNSPEC_VSLIDEUP, vmode);
+  emit_vlmax_insn (icode, SLIDEUP_OP_MERGE, ops);
+
+  return true;
+}
+
 /* Recognize decompress patterns:
 
1. VEC_PERM_EXPR op0 and op1
@@ -3870,6 +3934,8 @@ expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
return true;
  if (shuffle_interleave_patterns (d))
return true;
+ if (shuffle_evenodd_patterns (d))
+   return true;
  if (shuffle_compress_patterns (d))
return true;
  if (shuffle_decompress_patterns (d))
diff --git 
a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/shuffle-evenodd-run.c 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/shuffle-evenodd-run.c
new file mode 100644
index 000..c0760e5ed30
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/shuffle-evenodd-run.c
@@ -0,0 +1,122 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target riscv_v_ok } */
+/* { dg-add-options riscv_v } */
+/* { dg-additional-options "-O3 -mrvv-max-lmul=m8 -std=gnu99" } */
+
+#include "shuffle-evenodd.c"
+
+#define SERIES_2(x, y) (x), (x + 1)
+#define SERIES_4(x, y) SERIES_2 (x, y), SERIES_2 (x + 2, y)
+#define SERIES_8(x, y) SERIES_4 (x, y), SERIES_4 (x + 4, y)
+#define SERIES_16(x, y) SERIES_8 (x, y), SERIES_8 (x + 8, y)
+#define SERIES_32(x, y) SERIES_16 (x, y), SERIES_16 (x + 16, y)
+#define SERIES_64(x, y) SERIES_32 (x, y), SERIES_32 (x + 32, y)
+
+#define comp(a, b, n)  
\
+  for (unsigned i = 0; i < n; ++i) 
\
+if ((a)[i] != (b)[i])  
\
+  __builtin_abort ();
+
+#define CHECK1(TYPE, NUNITS)   
\
+  __attribute__ ((noipa)) void check1_##TYPE ()
\
+  {
\
+TYPE v0 = (TYPE){SERIES_##NUNITS (0, NUNITS)}; 
\
+TYPE v1 = (TYPE){SERIES_##NUNITS (NUNITS, NUNITS)};
\
+TYPE ref = (TYPE){MASKE_##NUNITS (0, NUNITS)}; 
\
+TYPE res;  
\
+permute1_##TYPE (v0, v1, &res);
\
+comp (res,

Re: [PATCH v4 0/8] Add maskload else operand.

2024-11-17 Thread Robin Dapp

Ugh... passed the wrong old cover letter to git send-email.
Nothing to do with the else operand, it's just the permutes
here and the proper cover letter is in 0/4...

-- 
Regards
 Robin

Re: [r15-5336 Regression] FAIL: gcc.dg/guality/pr36728-4.c -Os -DPREVENT_OPTIMIZATION line 16 y == 2 on Linux/x86_64

2024-11-17 Thread Jan Hubicka

> 
> 
> > Am 17.11.2024 um 09:54 schrieb Andrew Pinski :
> > 
> > On Sat, Nov 16, 2024 at 9:27 AM haochen.jiang  
> > wrote:
> >> 
> >> On Linux/x86_64,
> >> 
> >> cee7d080d5c2a5fb8125878998b742c040ec88b4 is the first bad commit
> >> commit cee7d080d5c2a5fb8125878998b742c040ec88b4
> >> Author: Jan Hubicka 
> >> Date:   Sat Nov 16 14:04:32 2024 +0100
> >> 
> >>Ignore conditions guarding __builtin_unreachable in inliner metrics
> >> 
> >> caused
> >> 
> >> FAIL: gcc.dg/guality/pr36728-2.c   -O2  -DPREVENT_OPTIMIZATION  line 16 
> >> arg1 == 1
> >> FAIL: gcc.dg/guality/pr36728-2.c   -O2  -DPREVENT_OPTIMIZATION  line 16 
> >> arg2 == 2
> >> FAIL: gcc.dg/guality/pr36728-2.c   -O2  -DPREVENT_OPTIMIZATION  line 16 
> >> arg3 == 3
> >> FAIL: gcc.dg/guality/pr36728-2.c   -O2  -DPREVENT_OPTIMIZATION  line 16 
> >> arg4 == 4
> >> FAIL: gcc.dg/guality/pr36728-2.c   -O2  -DPREVENT_OPTIMIZATION  line 16 
> >> arg5 == 5
> >> FAIL: gcc.dg/guality/pr36728-2.c   -O2  -DPREVENT_OPTIMIZATION  line 16 
> >> arg6 == 6
> >> FAIL: gcc.dg/guality/pr36728-2.c   -O2  -DPREVENT_OPTIMIZATION  line 16 
> >> arg7 == 30
> >> FAIL: gcc.dg/guality/pr36728-2.c   -O2  -DPREVENT_OPTIMIZATION  line 18 
> >> arg1 == 1
> >> FAIL: gcc.dg/guality/pr36728-2.c   -O2  -DPREVENT_OPTIMIZATION  line 18 
> >> arg2 == 2
> >> FAIL: gcc.dg/guality/pr36728-2.c   -O2  -DPREVENT_OPTIMIZATION  line 18 
> >> arg3 == 3
> >> FAIL: gcc.dg/guality/pr36728-2.c   -O2  -DPREVENT_OPTIMIZATION  line 18 
> >> arg4 == 4
> >> FAIL: gcc.dg/guality/pr36728-2.c   -O2  -DPREVENT_OPTIMIZATION  line 18 
> >> arg5 == 5
> >> FAIL: gcc.dg/guality/pr36728-2.c   -O2  -DPREVENT_OPTIMIZATION  line 18 
> >> arg6 == 6
> >> FAIL: gcc.dg/guality/pr36728-2.c   -O2  -DPREVENT_OPTIMIZATION  line 18 
> >> arg7 == 30
> >> FAIL: gcc.dg/guality/pr36728-2.c   -O2 -flto -fno-use-linker-plugin 
> >> -flto-partition=none  -DPREVENT_OPTIMIZATION line 16 arg1 == 1
> >> FAIL: gcc.dg/guality/pr36728-2.c   -O2 -flto -fno-use-linker-plugin 
> >> -flto-partition=none  -DPREVENT_OPTIMIZATION line 16 arg2 == 2
> >> FAIL: gcc.dg/guality/pr36728-2.c   -O2 -flto -fno-use-linker-plugin 
> >> -flto-partition=none  -DPREVENT_OPTIMIZATION line 16 arg3 == 3
> >> FAIL: gcc.dg/guality/pr36728-2.c   -O2 -flto -fno-use-linker-plugin 
> >> -flto-partition=none  -DPREVENT_OPTIMIZATION line 16 arg4 == 4
> >> FAIL: gcc.dg/guality/pr36728-2.c   -O2 -flto -fno-use-linker-plugin 
> >> -flto-partition=none  -DPREVENT_OPTIMIZATION line 16 arg5 == 5
> >> FAIL: gcc.dg/guality/pr36728-2.c   -O2 -flto -fno-use-linker-plugin 
> >> -flto-partition=none  -DPREVENT_OPTIMIZATION line 16 arg6 == 6
> >> FAIL: gcc.dg/guality/pr36728-2.c   -O2 -flto -fno-use-linker-plugin 
> >> -flto-partition=none  -DPREVENT_OPTIMIZATION line 16 arg7 == 30
> >> FAIL: gcc.dg/guality/pr36728-2.c   -O2 -flto -fno-use-linker-plugin 
> >> -flto-partition=none  -DPREVENT_OPTIMIZATION line 18 arg1 == 1
> >> FAIL: gcc.dg/guality/pr36728-2.c   -O2 -flto -fno-use-linker-plugin 
> >> -flto-partition=none  -DPREVENT_OPTIMIZATION line 18 arg2 == 2
> >> FAIL: gcc.dg/guality/pr36728-2.c   -O2 -flto -fno-use-linker-plugin 
> >> -flto-partition=none  -DPREVENT_OPTIMIZATION line 18 arg3 == 3
> >> FAIL: gcc.dg/guality/pr36728-2.c   -O2 -flto -fno-use-linker-plugin 
> >> -flto-partition=none  -DPREVENT_OPTIMIZATION line 18 arg4 == 4
> >> FAIL: gcc.dg/guality/pr36728-2.c   -O2 -flto -fno-use-linker-plugin 
> >> -flto-partition=none  -DPREVENT_OPTIMIZATION line 18 arg5 == 5
> >> FAIL: gcc.dg/guality/pr36728-2.c   -O2 -flto -fno-use-linker-plugin 
> >> -flto-partition=none  -DPREVENT_OPTIMIZATION line 18 arg6 == 6
> >> FAIL: gcc.dg/guality/pr36728-2.c   -O2 -flto -fno-use-linker-plugin 
> >> -flto-partition=none  -DPREVENT_OPTIMIZATION line 18 arg7 == 30
> >> FAIL: gcc.dg/guality/pr36728-2.c   -O2 -flto -fuse-linker-plugin 
> >> -fno-fat-lto-objects  -DPREVENT_OPTIMIZATION line 16 arg1 == 1
> >> FAIL: gcc.dg/guality/pr36728-2.c   -O2 -flto -fuse-linker-plugin 
> >> -fno-fat-lto-objects  -DPREVENT_OPTIMIZATION line 16 arg2 == 2
> >> FAIL: gcc.dg/guality/pr36728-2.c   -O2 -flto -fuse-linker-plugin 
> >> -fno-fat-lto-objects  -DPREVENT_OPTIMIZATION line 16 arg3 == 3
> >> FAIL: gcc.dg/guality/pr36728-2.c   -O2 -flto -fuse-linker-plugin 
> >> -fno-fat-lto-objects  -DPREVENT_OPTIMIZATION line 16 arg4 == 4
> >> FAIL: gcc.dg/guality/pr36728-2.c   -O2 -flto -fuse-linker-plugin 
> >> -fno-fat-lto-objects  -DPREVENT_OPTIMIZATION line 16 arg5 == 5
> >> FAIL: gcc.dg/guality/pr36728-2.c   -O2 -flto -fuse-linker-plugin 
> >> -fno-fat-lto-objects  -DPREVENT_OPTIMIZATION line 16 arg6 == 6
> >> FAIL: gcc.dg/guality/pr36728-2.c   -O2 -flto -fuse-linker-plugin 
> >> -fno-fat-lto-objects  -DPREVENT_OPTIMIZATION line 16 arg7 == 30
> >> FAIL: gcc.dg/guality/pr36728-2.c   -O2 -flto -fuse-linker-plugin 
> >> -fno-fat-lto-objects  -DPREVENT_OPTIMIZATION line 18 arg1 == 1
> >> FAIL: gcc.dg/guality/pr36728-2.c   -O2 -flto -fuse-linker-plugin 
> >> -fno-fat-lto-objects  -DPREVENT_OPTIMIZATION line 18 arg2

[PATCH 4/4] RISC-V: Improve slide1up pattern.

2024-11-17 Thread Robin Dapp

From: Robin Dapp 

This patch adds a second variant to implement the extract/slide1up
pattern.  In order to do a permutation like
<3, 4, 5, 6> from vectors <0, 1, 2, 3> and <4, 5, 6, 7>
we currently extract <3> from the first vector and re-insert it into the
second vector.  Unless register-file crossing latency is essentially
zero it should be preferable to first slide the second vector up by
one, then slide down the first vector by (nunits - 1).

gcc/ChangeLog:

* config/riscv/riscv-protos.h (riscv_register_move_cost):
Export.
* config/riscv/riscv-v.cc (shuffle_extract_and_slide1up_patterns):
Add slideup/slidedown variant.
* config/riscv/riscv.cc (riscv_register_move_cost): Remove
static.  Handle moves between V_REGS and GR_REGS/FP_REGS.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/autovec/pr112599-2.c: Adjust test
expectation.
---
 gcc/config/riscv/riscv-protos.h   |  1 +
 gcc/config/riscv/riscv-v.cc   | 44 ++-
 gcc/config/riscv/riscv.cc | 18 +++-
 .../gcc.target/riscv/rvv/autovec/pr112599-2.c |  2 +-
 4 files changed, 52 insertions(+), 13 deletions(-)

diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
index 500b357f6eb..ecb4e64cdf8 100644
--- a/gcc/config/riscv/riscv-protos.h
+++ b/gcc/config/riscv/riscv-protos.h
@@ -139,6 +139,7 @@ extern void riscv_expand_ussub (rtx, rtx, rtx);
 extern void riscv_expand_sssub (rtx, rtx, rtx);
 extern void riscv_expand_ustrunc (rtx, rtx);
 extern void riscv_expand_sstrunc (rtx, rtx);
+extern int riscv_register_move_cost (machine_mode, reg_class_t, reg_class_t);
 
 #ifdef RTX_CODE
 extern void riscv_expand_int_scc (rtx, enum rtx_code, rtx, rtx, bool 
*invert_ptr = 0);
diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc
index 4fb032af953..76ee95d5b21 100644
--- a/gcc/config/riscv/riscv-v.cc
+++ b/gcc/config/riscv/riscv-v.cc
@@ -3800,17 +3800,39 @@ shuffle_extract_and_slide1up_patterns (struct 
expand_vec_perm_d *d)
   if (d->testing_p)
 return true;
 
-  /* Extract the last element of the first vector.  */
-  scalar_mode smode = GET_MODE_INNER (d->vmode);
-  rtx tmp = gen_reg_rtx (smode);
-  emit_vec_extract (tmp, d->op0, gen_int_mode (nunits - 1, Pmode));
-
-  /* Insert the scalar into element 0.  */
-  unsigned int unspec
-= FLOAT_MODE_P (d->vmode) ? UNSPEC_VFSLIDE1UP : UNSPEC_VSLIDE1UP;
-  insn_code icode = code_for_pred_slide (unspec, d->vmode);
-  rtx ops[] = {d->target, d->op1, tmp};
-  emit_vlmax_insn (icode, BINARY_OP, ops);
+  int scalar_cost = riscv_register_move_cost (d->vmode, V_REGS, GR_REGS)
++ riscv_register_move_cost (d->vmode, GR_REGS, V_REGS) + 2;
+  int slide_cost = 2;
+
+  if (slide_cost < scalar_cost)
+{
+  /* This variant should always be preferable because we just need two
+slides.  The extract-variant also requires two slides but additionally
+pays the latency for register-file crossing.  */
+  rtx tmp = gen_reg_rtx (d->vmode);
+  rtx ops[] = {tmp, d->op1, gen_int_mode (1, Pmode)};
+  insn_code icode = code_for_pred_slide (UNSPEC_VSLIDEUP, d->vmode);
+  emit_vlmax_insn (icode, BINARY_OP, ops);
+
+  rtx ops2[] = {d->target, tmp, d->op0, gen_int_mode (nunits - 1, Pmode)};
+  icode = code_for_pred_slide (UNSPEC_VSLIDEDOWN, d->vmode);
+  emit_nonvlmax_insn (icode, BINARY_OP_TUMA, ops2, gen_int_mode (1, 
Pmode));
+}
+  else
+{
+  /* Extract the last element of the first vector.  */
+  scalar_mode smode = GET_MODE_INNER (d->vmode);
+  rtx tmp = gen_reg_rtx (smode);
+  emit_vec_extract (tmp, d->op0, gen_int_mode (nunits - 1, Pmode));
+
+  /* Insert the scalar into element 0.  */
+  unsigned int unspec
+   = FLOAT_MODE_P (d->vmode) ? UNSPEC_VFSLIDE1UP : UNSPEC_VSLIDE1UP;
+  insn_code icode = code_for_pred_slide (unspec, d->vmode);
+  rtx ops[] = {d->target, d->op1, tmp};
+  emit_vlmax_insn (icode, BINARY_OP, ops);
+}
+
   return true;
 }
 
diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
index 7694954c4c5..62b80fefedd 100644
--- a/gcc/config/riscv/riscv.cc
+++ b/gcc/config/riscv/riscv.cc
@@ -9464,7 +9464,7 @@ riscv_secondary_memory_needed (machine_mode mode, 
reg_class_t class1,
 
 /* Implement TARGET_REGISTER_MOVE_COST.  */
 
-static int
+int
 riscv_register_move_cost (machine_mode mode,
  reg_class_t from, reg_class_t to)
 {
@@ -9472,6 +9472,22 @@ riscv_register_move_cost (machine_mode mode,
   (from == GR_REGS && to == FP_REGS))
 return tune_param->fmv_cost;
 
+  if (from == V_REGS)
+{
+  if (to == GR_REGS)
+   return get_vector_costs ()->regmove->VR2GR;
+  else if (to == FP_REGS)
+   return get_vector_costs ()->regmove->VR2FR;
+}
+
+  if (to == V_REGS)
+{
+  if (from == GR_REGS)
+   return get_vector_costs ()->regmove->GR2VR;
+  else if (from == FP_REGS)
+   return get_vector_costs ()->regmove->FR2VR;
+}

Re: [PATCH 1/2] Produce CodeView info about nested types

2024-11-17 Thread Jeff Law





On 11/17/24 9:01 AM, Mark Harmstone wrote:

If the DIE for a struct, class, or union contains a nested type, add a
LF_NESTTYPE entry to its field list recording this.

Plus if we use a nested type, make sure that its parent also gets
defined. This may entail adding a forward definition and creating a
deferred type, so we need to call flush_deferred_types in
codeview_debug_finish as well.

gcc/
* dwarf2codeview.cc (enum cv_leaf_type): Add LF_NESTTYPE.
(struct codeview_subtype): Add lf_nesttype to union.
(flush_deferred_types): Add declaration.
(write_lf_fieldlist): Handle LF_NESTTYPE.
(codeview_debug_finish): Call flush_deferred_types.
(add_struct_nested_type): New function.
(get_type_num_struct): Call add_struct_nested_type, and if nested make
sure that parent is added.
---
(This doesn't logically depend on my pending S_INLINESITE patches, but
does if you are attempting to apply this cleanly. I'm getting this out
before the code freeze.)
Thanks for pointing this out.  Just a note, it's really a submission 
deadline.




OK for the trunk.

Thanks!

jeff

Re: [PATCH 3/4] RISC-V: Add even/odd vec_perm_const pattern.

2024-11-17 Thread 钟居哲

+shuffle_evenodd_patterns (struct expand_vec_perm_d *d)

I would prefer it be renamed to shuffle_even_odd_patterns.


juzhe.zh...@rivai.ai
 
From: Robin Dapp
Date: 2024-11-17 20:53
To: gcc-patches
CC: palmer; kito.cheng; juzhe.zhong; jeffreyalaw; pan2.li; rdapp.gcc
Subject: [PATCH 3/4] RISC-V: Add even/odd vec_perm_const pattern.
From: Robin Dapp 
 
This adds handling for even/odd patterns.
 
gcc/ChangeLog:
 
* config/riscv/riscv-v.cc (shuffle_evenodd_patterns): New
function.
(expand_vec_perm_const_1): Use new function.
 
gcc/testsuite/ChangeLog:
 
* gcc.target/riscv/rvv/autovec/vls-vlmax/shuffle-evenodd-run.c: New test.
* gcc.target/riscv/rvv/autovec/vls-vlmax/shuffle-evenodd.c: New test.
---
gcc/config/riscv/riscv-v.cc   |  66 ++
.../autovec/vls-vlmax/shuffle-evenodd-run.c   | 122 ++
.../rvv/autovec/vls-vlmax/shuffle-evenodd.c   |  68 ++
3 files changed, 256 insertions(+)
create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/shuffle-evenodd-run.c
create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/shuffle-evenodd.c
 
diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc
index d940b961bf8..4fb032af953 100644
--- a/gcc/config/riscv/riscv-v.cc
+++ b/gcc/config/riscv/riscv-v.cc
@@ -3552,6 +3552,70 @@ shuffle_interleave_patterns (struct expand_vec_perm_d *d)
   return true;
}
+
+/* Recognize even/odd patterns like [0 2 4 6].  We use two compress
+   and one slideup.  */
+
+static bool
+shuffle_evenodd_patterns (struct expand_vec_perm_d *d)
+{
+  machine_mode vmode = d->vmode;
+  poly_int64 vec_len = d->perm.length ();
+  int n_patterns = d->perm.encoding ().npatterns ();
+
+  if (n_patterns != 1)
+return false;
+
+  if (!vec_len.is_constant ())
+return false;
+
+  int vlen = vec_len.to_constant ();
+  if (vlen < 4 || vlen > 64)
+return false;
+
+  if (d->one_vector_p)
+return false;
+
+  bool even = true;
+  if (!d->perm.series_p (0, 1, 0, 2))
+{
+  even = false;
+  if (!d->perm.series_p (0, 1, 1, 2))
+ return false;
+}
+
+  /* Success!  */
+  if (d->testing_p)
+return true;
+
+  machine_mode mask_mode = get_mask_mode (vmode);
+  rvv_builder builder (mask_mode, vlen, 1);
+  int bit = even ? 0 : 1;
+  for (int i = 0; i < vlen; i++)
+{
+  bit ^= 1;
+  if (bit)
+ builder.quick_push (CONST1_RTX (BImode));
+  else
+ builder.quick_push (CONST0_RTX (BImode));
+}
+  rtx mask = force_reg (mask_mode, builder.build ());
+
+  insn_code icode = code_for_pred_compress (vmode);
+  rtx ops1[] = {d->target, d->op0, mask};
+  emit_vlmax_insn (icode, COMPRESS_OP, ops1);
+
+  rtx tmp2 = gen_reg_rtx (vmode);
+  rtx ops2[] = {tmp2, d->op1, mask};
+  emit_vlmax_insn (icode, COMPRESS_OP, ops2);
+
+  rtx ops[] = {d->target, d->target, tmp2, gen_int_mode (vlen / 2, Pmode)};
+  icode = code_for_pred_slide (UNSPEC_VSLIDEUP, vmode);
+  emit_vlmax_insn (icode, SLIDEUP_OP_MERGE, ops);
+
+  return true;
+}
+
/* Recognize decompress patterns:
1. VEC_PERM_EXPR op0 and op1
@@ -3870,6 +3934,8 @@ expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
return true;
  if (shuffle_interleave_patterns (d))
return true;
+   if (shuffle_evenodd_patterns (d))
+ return true;
  if (shuffle_compress_patterns (d))
return true;
  if (shuffle_decompress_patterns (d))
diff --git 
a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/shuffle-evenodd-run.c 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/shuffle-evenodd-run.c
new file mode 100644
index 000..c0760e5ed30
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/shuffle-evenodd-run.c
@@ -0,0 +1,122 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target riscv_v_ok } */
+/* { dg-add-options riscv_v } */
+/* { dg-additional-options "-O3 -mrvv-max-lmul=m8 -std=gnu99" } */
+
+#include "shuffle-evenodd.c"
+
+#define SERIES_2(x, y) (x), (x + 1)
+#define SERIES_4(x, y) SERIES_2 (x, y), SERIES_2 (x + 2, y)
+#define SERIES_8(x, y) SERIES_4 (x, y), SERIES_4 (x + 4, y)
+#define SERIES_16(x, y) SERIES_8 (x, y), SERIES_8 (x + 8, y)
+#define SERIES_32(x, y) SERIES_16 (x, y), SERIES_16 (x + 16, y)
+#define SERIES_64(x, y) SERIES_32 (x, y), SERIES_32 (x + 32, y)
+
+#define comp(a, b, n)  
\
+  for (unsigned i = 0; i < n; ++i) 
\
+if ((a)[i] != (b)[i])  
\
+  __builtin_abort ();
+
+#define CHECK1(TYPE, NUNITS)   
\
+  __attribute__ ((noipa)) void check1_##TYPE ()
\
+  {
\
+TYPE v0 = (TYPE){SERIES_##NUNITS (0, NUNITS)}; 
\
+TYPE v1 = (TYPE){SERIES_##NUNITS (NUNITS, NUNITS)};
\
+TYPE ref = (TYPE){MASKE_##NUNITS (0, NUNITS)};

70 matches

Mail list logo