Re: [PATCH 0/2] Add tests to verify OpenACC clause locations

2020-11-03 Thread Thomas Schwinge
Hi!

On 2019-12-10T15:23:01+0100, Frederik Harwath  wrote:
> On 09.12.19 16:58, Harwath, Frederik wrote:
>> [use] the location of clauses in warnings instead of the location of the 
>> loop to which the clause belongs.

> Frederik Harwath (2):
>   Use clause locations in OpenACC nested reduction warnings

Basically:

-warning_at (gimple_location (stmt), 0,
+warning_at (OMP_CLAUSE_LOCATION (clause), 0,

>   Add tests to verify OpenACC clause locations

Similar changes are desirable for other directives/clauses, too.

I've just pushed "[OpenACC] More precise diagnostics for 'gang',
'worker', 'vector' clauses with arguments on 'loop' only allowed in
'kernels' regions" to master branch in commit
beddd1762ad2bbe84dd776c54489153f83f21e56, and backported to
releases/gcc-10 in commit 8d09f49006ce4c2f8d4018206c12e131c49ca6ce, see
attached.


Grüße
 Thomas


-
Mentor Graphics (Deutschland) GmbH, Arnulfstraße 201, 80634 München / Germany
Registergericht München HRB 106955, Geschäftsführer: Thomas Heurung, Alexander 
Walter
>From beddd1762ad2bbe84dd776c54489153f83f21e56 Mon Sep 17 00:00:00 2001
From: Thomas Schwinge 
Date: Tue, 27 Oct 2020 17:13:16 +0100
Subject: [PATCH] [OpenACC] More precise diagnostics for 'gang', 'worker',
 'vector' clauses with arguments on 'loop' only allowed in 'kernels' regions

Instead of at the location of the 'loop' directive, 'error_at' the location of
the improper clause, and 'inform' at the location of the enclosing parent
compute construct/routine.

The Fortran testcases come with some XFAILing, to be resolved later.

	gcc/
	* omp-low.c (scan_omp_for) : More precise diagnostics for
	'gang', 'worker', 'vector' clauses with arguments only allowed in
	'kernels' regions.
	gcc/testsuite/
	* c-c++-common/goacc/pr92793-1.c: Extend.
	* gfortran.dg/goacc/pr92793-1.f90: Likewise.
---
 gcc/omp-low.c | 29 +++
 gcc/testsuite/c-c++-common/goacc/pr92793-1.c  | 37 +
 gcc/testsuite/gfortran.dg/goacc/pr92793-1.f90 | 52 +++
 3 files changed, 108 insertions(+), 10 deletions(-)

diff --git a/gcc/omp-low.c b/gcc/omp-low.c
index 5392fa7e3086..de5142f979b0 100644
--- a/gcc/omp-low.c
+++ b/gcc/omp-low.c
@@ -2418,30 +2418,39 @@ scan_omp_for (gomp_for *stmt, omp_context *outer_ctx)
   if (!tgt || is_oacc_parallel_or_serial (tgt))
 	for (tree c = clauses; c; c = OMP_CLAUSE_CHAIN (c))
 	  {
-	char const *check = NULL;
-
+	tree c_op0;
 	switch (OMP_CLAUSE_CODE (c))
 	  {
 	  case OMP_CLAUSE_GANG:
-		check = "gang";
+		c_op0 = OMP_CLAUSE_GANG_EXPR (c);
 		break;
 
 	  case OMP_CLAUSE_WORKER:
-		check = "worker";
+		c_op0 = OMP_CLAUSE_WORKER_EXPR (c);
 		break;
 
 	  case OMP_CLAUSE_VECTOR:
-		check = "vector";
+		c_op0 = OMP_CLAUSE_VECTOR_EXPR (c);
 		break;
 
 	  default:
-		break;
+		continue;
 	  }
 
-	if (check && OMP_CLAUSE_OPERAND (c, 0))
-	  error_at (gimple_location (stmt),
-			"argument not permitted on %qs clause in"
-			" OpenACC % or %", check);
+	if (c_op0)
+	  {
+		error_at (OMP_CLAUSE_LOCATION (c),
+			  "argument not permitted on %qs clause",
+			  omp_clause_code_name[OMP_CLAUSE_CODE (c)]);
+		if (tgt)
+		  inform (gimple_location (outer_ctx->stmt),
+			  "enclosing parent compute construct");
+		else if (oacc_get_fn_attrib (current_function_decl))
+		  inform (DECL_SOURCE_LOCATION (current_function_decl),
+			  "enclosing routine");
+		else
+		  gcc_unreachable ();
+	  }
 	  }
 
   if (tgt && is_oacc_kernels (tgt))
diff --git a/gcc/testsuite/c-c++-common/goacc/pr92793-1.c b/gcc/testsuite/c-c++-common/goacc/pr92793-1.c
index d7a2ae487992..77ebb20265cf 100644
--- a/gcc/testsuite/c-c++-common/goacc/pr92793-1.c
+++ b/gcc/testsuite/c-c++-common/goacc/pr92793-1.c
@@ -54,3 +54,40 @@ reduction(-:sum  ) /* { dg-line sum2 } */ \
   }
   }
 }
+
+
+void
+a_sl() {
+#pragma acc serial loop /* { dg-message "9: enclosing parent compute construct" } */ \
+gang(num:5) /* { dg-error "5: argument not permitted on 'gang' clause" } */ \
+  worker(num:5) /* { dg-error "3: argument not permitted on 'worker' clause" } */ \
+   vector(length:5) /* { dg-error "4: argument not permitted on 'vector' clause" } */
+  for (int i = 0; i < 10; i++)
+;
+}
+
+void
+a_s_l() {
+#pragma acc serial /* { dg-message "9: enclosing parent compute construct" } */
+  {
+#pragma acc loop \
+   gang(num:5) /* { dg-error "8: argument not permitted on 'gang' clause" } */ \
+   worker(num:5) /* { dg-error "4: argument not permitted on 'worker' clause" } */ \
+  vector(length:5) /* { dg-error "3: argument not permitted on 'vector' clause" } */
+for (int i = 0; i < 10; i++)
+  ;
+  }
+}
+
+void a_r();
+#pragma acc routine(a_r)
+
+void
+a_r() { /* { dg-message "1: enclosing routine" } */
+#pragma acc loop \
+   gang(num:5) /* { dg-error "4: argument not permitted on 'gang' clause" } */ \
+worker(num:5) /* { dg-error "5: argument not permitted on 'worker' clause" } */ \
+ 

Re: [PATCH] libstdc++: use lt_host_flags for libstdc++.la

2020-11-03 Thread Jonathan Wakely via Gcc-patches

On 16/09/20 13:16 +, JonY via Libstdc++ wrote:

For platforms like Mingw and Cygwin, cygwin refuses to generate the
shared library without using -no-undefined.

Attached patch makes sure the right flags are used, since libtool is
already used to link libstdc++.

Patch OK?


I've pushed it to trunk myself now. Thanks.



From 4ba039687182fccd67e1170f89e259e1c4a58eeb Mon Sep 17 00:00:00 2001
From: Jonathan Yong <10wa...@gmail.com>
Date: Sun, 22 Mar 2020 09:56:58 +0800
Subject: [PATCH 1/1] libstdc++: use lt_host_flags for libstdc++.la

Signed-off-by: Jonathan Yong <10wa...@gmail.com>
---
libstdc++-v3/src/Makefile.am | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libstdc++-v3/src/Makefile.am b/libstdc++-v3/src/Makefile.am
index a139adc81b3..498f533f3d3 100644
--- a/libstdc++-v3/src/Makefile.am
+++ b/libstdc++-v3/src/Makefile.am
@@ -107,7 +107,7 @@ libstdc___la_DEPENDENCIES = \
libstdc___la_LDFLAGS = \
-version-info $(libtool_VERSION) ${version_arg} -lm

-libstdc___la_LINK = $(CXXLINK) $(libstdc___la_LDFLAGS)
+libstdc___la_LINK = $(CXXLINK) $(libstdc___la_LDFLAGS) $(lt_host_flags)

# Use special rules for compatibility-ldbl.cc compilation, as we need to
# pass -mlong-double-64.
--
2.11.4.GIT








[patch] i386 tests: Add dg-require-effective-target fpic to gcc.target/i386 tests

2020-11-03 Thread Olivier Hainque
Hello,

This change is a proposal to add 

  /* { dg-require-effective-target fpic } */

to tests in gcc.target/i386 that do use -fpic or -fPIC
but don't currently query the target support.

This corresponds to what many other fpic tests do
and helps the vxWorks ports at least, as -fpic is
typically not supported in at least one of the two
major modes of such port (kernel vs RTP).

I verified that it doesn't affect testing results
on a native x86_64-linux configuration.

Ok to commit ?

Thanks in advance!

With Best Regards,

Olivier

2020-11-03  Olivier Hainque  

* gcc.target/i386/indirect-thunk-5.c: Add
dg-require-effective-target fpic.
* gcc.target/i386/indirect-thunk-6.c: Likewise.
* gcc.target/i386/indirect-thunk-extern-5.c: Likewise.
* gcc.target/i386/indirect-thunk-extern-6.c: Likewise.
* gcc.target/i386/indirect-thunk-inline-5.c: Likewise.
* gcc.target/i386/indirect-thunk-inline-6.c: Likewise.
* gcc.target/i386/noplt-gd-1.c: Likewise.
* gcc.target/i386/noplt-gd-2.c: Likewise.
* gcc.target/i386/noplt-gd-3.c: Likewise.
* gcc.target/i386/noplt-ld-1.c: Likewise.
* gcc.target/i386/noplt-ld-2.c: Likewise.
* gcc.target/i386/noplt-ld-3.c: Likewise.
* gcc.target/i386/pr45352-1.c: Likewise.
* gcc.target/i386/pr47602.c: Likewise.
* gcc.target/i386/pr55151.c: Likewise.
* gcc.target/i386/pr55458.c: Likewise.
* gcc.target/i386/pr56348.c: Likewise.
* gcc.target/i386/pr57097.c: Likewise.
* gcc.target/i386/pr65248-1.c: Likewise.
* gcc.target/i386/pr65248-2.c: Likewise.
* gcc.target/i386/pr65248-3.c: Likewise.
* gcc.target/i386/pr65248-4.c: Likewise.
* gcc.target/i386/pr65753.c: Likewise.
* gcc.target/i386/pr65915.c: Likewise.
* gcc.target/i386/pr66232-1.c: Likewise.
* gcc.target/i386/pr66232-10.c: Likewise.
* gcc.target/i386/pr66232-11.c: Likewise.
* gcc.target/i386/pr66232-12.c: Likewise.
* gcc.target/i386/pr66232-13.c: Likewise.
* gcc.target/i386/pr66232-2.c: Likewise.
* gcc.target/i386/pr66232-3.c: Likewise.
* gcc.target/i386/pr66232-4.c: Likewise.
* gcc.target/i386/pr66232-5.c: Likewise.
* gcc.target/i386/pr66334.c: Likewise.
* gcc.target/i386/pr66819-2.c: Likewise.
* gcc.target/i386/pr67215-1.c: Likewise.
* gcc.target/i386/pr67215-2.c: Likewise.
* gcc.target/i386/pr67215-3.c: Likewise.
* gcc.target/i386/pr67265.c: Likewise.
* gcc.target/i386/pr68937-1.c: Likewise.
* gcc.target/i386/pr68937-2.c: Likewise.
* gcc.target/i386/pr68937-3.c: Likewise.
* gcc.target/i386/pr68937-4.c: Likewise.
* gcc.target/i386/pr68937-5.c: Likewise.
* gcc.target/i386/pr68937-6.c: Likewise.
* gcc.target/i386/pr81481.c: Likewise.
* gcc.target/i386/pr82699-3.c: Likewise.
* gcc.target/i386/pr82699-4.c: Likewise.
* gcc.target/i386/pr83994.c: Likewise.

diff --git a/gcc/testsuite/gcc.target/i386/indirect-thunk-5.c 
b/gcc/testsuite/gcc.target/i386/indirect-thunk-5.c
index fb26c005e80d..58689b70fa21 100644
--- a/gcc/testsuite/gcc.target/i386/indirect-thunk-5.c
+++ b/gcc/testsuite/gcc.target/i386/indirect-thunk-5.c
@@ -1,4 +1,5 @@
 /* { dg-do compile { target *-*-linux* } } */
+/* { dg-require-effective-target fpic } */
 /* { dg-options "-O2 -mno-indirect-branch-register -mfunction-return=keep 
-fpic -fno-plt -mindirect-branch=thunk" } */
 
 extern void bar (void);
diff --git a/gcc/testsuite/gcc.target/i386/indirect-thunk-6.c 
b/gcc/testsuite/gcc.target/i386/indirect-thunk-6.c
index 8bc45ff68ce9..f5ee97dc40c6 100644
--- a/gcc/testsuite/gcc.target/i386/indirect-thunk-6.c
+++ b/gcc/testsuite/gcc.target/i386/indirect-thunk-6.c
@@ -1,4 +1,5 @@
 /* { dg-do compile { target *-*-linux* } } */
+/* { dg-require-effective-target fpic } */
 /* { dg-options "-O2 -mno-indirect-branch-register -mfunction-return=keep 
-fpic -fno-plt -mindirect-branch=thunk" } */
 
 extern void bar (void);
diff --git a/gcc/testsuite/gcc.target/i386/indirect-thunk-extern-5.c 
b/gcc/testsuite/gcc.target/i386/indirect-thunk-extern-5.c
index 53282390977d..b0696774a3e6 100644
--- a/gcc/testsuite/gcc.target/i386/indirect-thunk-extern-5.c
+++ b/gcc/testsuite/gcc.target/i386/indirect-thunk-extern-5.c
@@ -1,4 +1,5 @@
 /* { dg-do compile { target *-*-linux* } } */
+/* { dg-require-effective-target fpic } */
 /* { dg-options "-O2 -mno-indirect-branch-register -mfunction-return=keep 
-fpic -fno-plt -mindirect-branch=thunk-extern" } */
 
 extern void bar (void);
diff --git a/gcc/testsuite/gcc.target/i386/indirect-thunk-extern-6.c 
b/gcc/testsuite/gcc.target/i386/indirect-thunk-extern-6.c
index 8ae43482d0cb..aa5d12deebe5 100644
--- a/gcc/testsuite/gcc.target/i386/indirect-thunk-extern-6.c
+++ b/gcc/testsuite/gcc.target/i386/indirect-thunk-extern-6.c
@@ -1,4 +1,5 @@
 /* { dg-do compile { target *-

Re: [PATCH] Add missing gnu-versioned-namespace symbols

2020-11-03 Thread Jonathan Wakely via Gcc-patches

On 02/11/20 21:52 +0100, François Dumont via Libstdc++ wrote:

On 02/11/20 3:17 pm, Jonathan Wakely wrote:

On 01/11/20 20:48 +0100, François Dumont via Libstdc++ wrote:

Several tests are failing because of those missing symbols.

I understand why we need to export symbols relying in the 
versioned namespace but I don't understand why we need to do it 
for _GLIBCXX_DEBUG symbols which are not version namespace 
dependant.


If you don't export the symbol, it can't be found by code linking to
libstdc++.so.8


So I understand that in versioned namespace mode only 
gnu-versioned-namespace.ver is being used and not gnu.ver.


Right.



This linker script is the only one used to build libstdc++.so.8 so all
symbols that need to be exported by that library have to be exported
by this script. Nothing exports that debug symbol unless you add it
here.

What I don't understand is why the __istream_extract symbol isn't
matched by the wildcard in the extern "C++" block at the top of the
file.


Maybe for the same reason that the std::__copy_streambufs before this 
one and some others symbols in std::__8 had to be explicitely exported 
too.


But I don't know it.


Yeah, I don't understand those either.

OK for trunk anyway. I'll investigate another day.




Re: [PATCH] Fix column information for omp_clauses in Fortran code

2020-11-03 Thread Thomas Schwinge
Hi!

On 2019-12-09T16:58:44+0100, "Harwath, Frederik"  
wrote:
> Tobias has recently fixed a problem with the column information in gfortran 
> locations
> ("PR 92793 - fix column used for error diagnostic").


In context of:

> [use] the location of clauses in warnings instead of the location of the loop 
> to which the clause belongs.

..., Frederik then did:

> Subject: [PATCH] Fix column information for omp_clauses in Fortran code
>
> The location of all OpenMP/OpenACC clauses on any given line in Fortran code
> always points to the first clause on that line. Hence, the column information
> is wrong for all clauses but the first one.
>
> Use the correct location for each clause instead.

Actually, that was specific for 'reduction' clauses:

> --- a/gcc/fortran/trans-openmp.c
> +++ b/gcc/fortran/trans-openmp.c
> @@ -1982,7 +1982,7 @@ gfc_trans_omp_reduction_list (gfc_omp_namelist 
> *namelist, tree list,
>   tree t = gfc_trans_omp_variable (namelist->sym, false);
>   if (t != error_mark_node)
> {
> - tree node = build_omp_clause (gfc_get_location (&where),
> + tree node = build_omp_clause (gfc_get_location (&namelist->where),
> OMP_CLAUSE_REDUCTION);

Similar changes are desirable for other directives/clauses, too.

I've just pushed "[Fortran] More precise location information for OpenACC
'gang', 'worker', 'vector' clauses with argument [PR92793]" to master
branch in commit 41f7f6178e2d35288273656dc55dae8fcf3edeb5, and backported
to releases/gcc-10 in commit 5ceaf8a54abb3f9bd3c268fe420999a7962b2a15,
see attached.


Grüße
 Thomas


-
Mentor Graphics (Deutschland) GmbH, Arnulfstraße 201, 80634 München / Germany
Registergericht München HRB 106955, Geschäftsführer: Thomas Heurung, Alexander 
Walter
>From 41f7f6178e2d35288273656dc55dae8fcf3edeb5 Mon Sep 17 00:00:00 2001
From: Thomas Schwinge 
Date: Tue, 27 Oct 2020 17:14:10 +0100
Subject: [PATCH] [Fortran] More precise location information for OpenACC
 'gang', 'worker', 'vector' clauses with argument [PR92793]

	gcc/fortran/
	PR fortran/92793
	* trans-openmp.c (gfc_trans_omp_clauses): More precise location
	information for OpenACC 'gang', 'worker', 'vector' clauses with
	argument.
	gcc/testsuite/
	PR fortran/92793
	* gfortran.dg/goacc/pr92793-1.f90: Adjust.
---
 gcc/fortran/trans-openmp.c| 40 ---
 gcc/testsuite/gfortran.dg/goacc/pr92793-1.f90 | 29 +-
 2 files changed, 36 insertions(+), 33 deletions(-)

diff --git a/gcc/fortran/trans-openmp.c b/gcc/fortran/trans-openmp.c
index d02949ecbe4a..1d652a09f9d2 100644
--- a/gcc/fortran/trans-openmp.c
+++ b/gcc/fortran/trans-openmp.c
@@ -3771,34 +3771,38 @@ gfc_trans_omp_clauses (stmtblock_t *block, gfc_omp_clauses *clauses,
 }
   if (clauses->vector)
 {
+  c = build_omp_clause (gfc_get_location (&where), OMP_CLAUSE_VECTOR);
+  omp_clauses = gfc_trans_add_clause (c, omp_clauses);
+
   if (clauses->vector_expr)
 	{
 	  tree vector_var
 	= gfc_convert_expr_to_tree (block, clauses->vector_expr);
-	  c = build_omp_clause (gfc_get_location (&where), OMP_CLAUSE_VECTOR);
 	  OMP_CLAUSE_VECTOR_EXPR (c) = vector_var;
-	  omp_clauses = gfc_trans_add_clause (c, omp_clauses);
-	}
-  else
-	{
-	  c = build_omp_clause (gfc_get_location (&where), OMP_CLAUSE_VECTOR);
-	  omp_clauses = gfc_trans_add_clause (c, omp_clauses);
+
+	  /* TODO: We're not capturing location information for individual
+	 clauses.  However, if we have an expression attached to the
+	 clause, that one provides better location information.  */
+	  OMP_CLAUSE_LOCATION (c)
+	= gfc_get_location (&clauses->vector_expr->where);
 	}
 }
   if (clauses->worker)
 {
+  c = build_omp_clause (gfc_get_location (&where), OMP_CLAUSE_WORKER);
+  omp_clauses = gfc_trans_add_clause (c, omp_clauses);
+
   if (clauses->worker_expr)
 	{
 	  tree worker_var
 	= gfc_convert_expr_to_tree (block, clauses->worker_expr);
-	  c = build_omp_clause (gfc_get_location (&where), OMP_CLAUSE_WORKER);
 	  OMP_CLAUSE_WORKER_EXPR (c) = worker_var;
-	  omp_clauses = gfc_trans_add_clause (c, omp_clauses);
-	}
-  else
-	{
-	  c = build_omp_clause (gfc_get_location (&where), OMP_CLAUSE_WORKER);
-	  omp_clauses = gfc_trans_add_clause (c, omp_clauses);
+
+	  /* TODO: We're not capturing location information for individual
+	 clauses.  However, if we have an expression attached to the
+	 clause, that one provides better location information.  */
+	  OMP_CLAUSE_LOCATION (c)
+	= gfc_get_location (&clauses->worker_expr->where);
 	}
 }
   if (clauses->gang)
@@ -3806,11 +3810,19 @@ gfc_trans_omp_clauses (stmtblock_t *block, gfc_omp_clauses *clauses,
   tree arg;
   c = build_omp_clause (gfc_get_location (&where), OMP_CLAUSE_GANG);
   omp_clauses = gfc_trans_add_clause (c, omp_clauses);
+
   if (clauses->gang_num_expr)
 	{
 	  arg = gfc_convert_expr_to_tree (block, clauses->gang_n

Re: [patch] i386 tests: Add dg-require-effective-target fpic to gcc.target/i386 tests

2020-11-03 Thread Jakub Jelinek via Gcc-patches
On Tue, Nov 03, 2020 at 09:25:03AM +0100, Olivier Hainque wrote:
> Hello,
> 
> This change is a proposal to add 
> 
>   /* { dg-require-effective-target fpic } */
> 
> to tests in gcc.target/i386 that do use -fpic or -fPIC
> but don't currently query the target support.
> 
> This corresponds to what many other fpic tests do
> and helps the vxWorks ports at least, as -fpic is
> typically not supported in at least one of the two
> major modes of such port (kernel vs RTP).

70% of the tests you've changed have a target *-linux* or similar
right above that line, what is the point of adding the fpic
effective targets to those?  Those surely aren't run on vxWorks
and on x86 Linux fpic is always supported.

No objection to adding it to the rest.

Jakub



Re: [og8] Report errors on missing OpenACC reduction clauses in nested reductions

2020-11-03 Thread Thomas Schwinge
Hi!

On 2018-12-20T15:28:33+0100, I wrote:
> On behalf of Gergő (who doesn't have write access yet) I've pushed the
> attached to openacc-gcc-8-branch.

(Which then eventually got into master branch via Frederik.)

> --- /dev/null
> +++ b/gcc/testsuite/c-c++-common/goacc/nested-reductions-fail.c
> @@ -0,0 +1,492 @@
> +/* Test erroneous cases of nested reduction loops.  */
> +
> +void acc_parallel (void)
> +{
> +  int i, j, k, l, sum, diff;
> +
> +  #pragma acc parallel
> +  {
> +#pragma acc loop reduction(+:sum)
> +for (i = 0; i < 10; i++)
> +  #pragma acc loop // { dg-error "nested loop in reduction needs 
> reduction clause for .sum." }
> +  for (j = 0; j < 10; j++)
> +#pragma acc loop reduction(+:sum)
> +for (k = 0; k < 10; k++)
> +  sum = 1;

> +void acc_kernels (void)
> +{
> +  int i, j, k, sum, diff;
> +
> +  /* FIXME:  No diagnostics are produced for these loops because reductions
> + in kernels regions are not supported yet.  */
> +  #pragma acc kernels
> +  {
> +#pragma acc loop reduction(+:sum)
> +for (i = 0; i < 10; i++)
> +  for (j = 0; j < 10; j++)
> +for (k = 0; k < 10; k++)
> +  sum = 1;

For ongoing maintenance, I find it easier if such repetitive testing
(here: to cover all different compute constructs and 'routine') is split
into separate files: easy to diff, etc.

> --- /dev/null
> +++ b/gcc/testsuite/c-c++-common/goacc/nested-reductions.c
> @@ -0,0 +1,420 @@
> +/* Test cases of nested reduction loops that should compile cleanly.  */

Likewise.

I've pushed "[OpenACC] Split up testcases for inconsistent nested
'reduction' clauses checking" to master branch in commit
fedf3e94efe774b8c0539d344130a7b25f50a881, and backported to
releases/gcc-10 branch in commit
eeeb6833d2c6bcc0e675928f17a75efb41eeaf13, see attached.


Grüße
 Thomas


-
Mentor Graphics (Deutschland) GmbH, Arnulfstraße 201, 80634 München / Germany
Registergericht München HRB 106955, Geschäftsführer: Thomas Heurung, Alexander 
Walter
>From fedf3e94efe774b8c0539d344130a7b25f50a881 Mon Sep 17 00:00:00 2001
From: Thomas Schwinge 
Date: Thu, 22 Oct 2020 09:45:31 +0200
Subject: [PATCH] [OpenACC] Split up testcases for inconsistent nested
 'reduction' clauses checking

	gcc/testsuite/
	* c-c++-common/goacc/nested-reductions.c: Split file into...
	* c-c++-common/goacc/nested-reductions-1-kernels.c: ... this...
	* c-c++-common/goacc/nested-reductions-1-parallel.c: ..., this...
	* c-c++-common/goacc/nested-reductions-1-routine.c: ..., and this.
	* c-c++-common/goacc/nested-reductions-warn.c: Split file into...
	* c-c++-common/goacc/nested-reductions-2-kernels.c: ... this...
	* c-c++-common/goacc/nested-reductions-2-parallel.c: ..., this...
	* c-c++-common/goacc/nested-reductions-2-routine.c: ..., and this.
	* gfortran.dg/goacc/nested-reductions.f90: Split file into...
	* gfortran.dg/goacc/nested-reductions-1-kernels.f90: ... this...
	* gfortran.dg/goacc/nested-reductions-1-parallel.f90: ..., this...
	* gfortran.dg/goacc/nested-reductions-1-routine.f90: ..., and
	this.
	* gfortran.dg/goacc/nested-reductions-warn.f90: Split file into...
	* gfortran.dg/goacc/nested-reductions-2-kernels.f90: ... this...
	* gfortran.dg/goacc/nested-reductions-2-parallel.f90: ..., this...
	* gfortran.dg/goacc/nested-reductions-2-routine.f90: ..., and
	this.
---
 .../goacc/nested-reductions-1-kernels.c   |  41 
 ...tions.c => nested-reductions-1-parallel.c} | 108 +--
 .../goacc/nested-reductions-1-routine.c   |  68 +++
 .../goacc/nested-reductions-2-kernels.c   |  50 +
 ...-warn.c => nested-reductions-2-parallel.c} | 142 +-
 .../goacc/nested-reductions-2-routine.c   |  93 +
 .../goacc/nested-reductions-1-kernels.f90 |  55 ++
 ...s.f90 => nested-reductions-1-parallel.f90} | 142 +-
 .../goacc/nested-reductions-1-routine.f90 |  88 +
 .../goacc/nested-reductions-2-kernels.f90 |  63 ++
 ...n.f90 => nested-reductions-2-parallel.f90} | 181 +-
 .../goacc/nested-reductions-2-routine.f90 | 119 
 12 files changed, 589 insertions(+), 561 deletions(-)
 create mode 100644 gcc/testsuite/c-c++-common/goacc/nested-reductions-1-kernels.c
 rename gcc/testsuite/c-c++-common/goacc/{nested-reductions.c => nested-reductions-1-parallel.c} (76%)
 create mode 100644 gcc/testsuite/c-c++-common/goacc/nested-reductions-1-routine.c
 create mode 100644 gcc/testsuite/c-c++-common/goacc/nested-reductions-2-kernels.c
 rename gcc/testsuite/c-c++-common/goacc/{nested-reductions-warn.c => nested-reductions-2-parallel.c} (78%)
 create mode 100644 gcc/testsuite/c-c++-common/goacc/nested-reductions-2-routine.c
 create mode 100644 gcc/testsuite/gfortran.dg/goacc/nested-reductions-1-kernels.f90
 rename gcc/testsuite/gfortran.dg/goacc/{nested-reductions.f90 => nested-reductions-1-parallel.f90} (75%)
 create mode 100644 gcc/testsuite/gfortran.dg/goacc/nested-reductions-1-routine.f90
 create 

Re: [Patch, fortran] PR83118 - [8/9/10/11 Regression] Bad intrinsic assignment of class(*) array component of derived type

2020-11-03 Thread Paul Richard Thomas via Gcc-patches
Ping!

On Thu, 29 Oct 2020 at 15:59, Paul Richard Thomas <
paul.richard.tho...@gmail.com> wrote:

> Hi Everyone,
>
> I am afraid that this is a rather long sad story, mainly due to my efforts
> with gfortran being interrupted by daytime work. I posted the first version
> of the patch nearly a year ago but this was derailed by Tobias's question
> at: https://gcc.gnu.org/legacy-ml/fortran/2019-11/msg00098.html
>
> (i) The attached fixes the original problem and is tested by
> gfortran.dg/unlimited_polymorphic_32.f03.
> (ii) In fixing the original problem, a fair amount of effort was required
> to get the element length correct for class temporaries produced by
> dependencies in class assignment (see footnote). This is reflected in the
> changes to trans_array.c(gfc_alloc_allocatable_for_assignment).
> (iii) Tobias's testcase in the above posting to the list didn't address
> itself to class arrays of the original problem. However, it revealed that
> reallocation was not occuring at all for scalar assignments.  This is fixed
> by the large chunk in trans-expr.c(trans_class_assignment). The array case
> is 'fixed' by testing for unequal element sizes between lhs and rhs before
> reallocation in gfc_alloc_allocatable_for_assignment. This is difficult to
> test for since, in most cases, the system returns that same address after
> reallocation.
> (iv) dependency_57.f90 segfaulted at runtime. The other work in
> trans_class_assignment was required to fix this.
> (v) A number of minor tidy ups were done including the new function
> gfc_resize_class_size_with_len to eliminate some repeated code.
>
> This all bootstraps and regtests on FC31/x86_64 - OK for master?
>
> Cheers
>
> Paul
>
> This patch fixes PR83118 and fixes one or two other niggles in handling
> class objects - most importantly class array temporaries required, where
> dependences occur in class assignment, and a correct implementation of
> reallocation on assignment.
>
> 2020-10-29  Paul Thomas  
>
> gcc/fortran
> PR fortran/83118
> * resolve.c (resolve_ordinary_assign): Generate a vtable if
> necessary for scalar non-polymorphic rhs's to unlimited lhs's.
> * trans-array.c (gfc_trans_allocate_array_storage): Defer
> obtaining class element type until all sources of class exprs.
> are tried. Use class API rather than TREE_OPERAND. Look for
> class expressions in ss->info. After this, obtain the element
> size for class payloads. Cast the data as character(len=size)
> to overcome unlimited polymorphic problems.
> (structure_alloc_comps): Replace code that replicates the new
> function gfc_resize_class_size_with_len.
> (gfc_alloc_allocatable_for_assignment): Obtain element size
> for lhs in cases of deferred characters and class entities.
> Move code for the element size of rhs to start of block. Clean
> up extraction of class parameters throughout this function.
> After the shape check test whether or not the lhs and rhs
> element sizes are the same. Use earlier evaluation of
> 'cond_null'. Reallocation of lhs only to happen if size changes
> or element size changes.
> * trans-expr.c (gfc_resize_class_size_with_len): New function.
> (gfc_conv_procedure_call): Ensure the vtable is present for
> passing a non-class actual to an unlimited formal.
> (trans_class_vptr_len_assignment): For expressions of type
> BT_CLASS, extract the class expression if necessary. Use a
> statement block outside the loop body. Ensure that 'rhs' is
> of the correct type. Obtain rhs vptr in all circumstances.
> (gfc_trans_assignment_1): Simplify some of the logic with
> 'realloc_flag'. Set 'vptr_copy' for all array assignments to
> unlimited polymorphic lhs.
> * trans-c (gfc_build_array_ref): Call gfc_resize_class_size_
> with_len to correct span for unlimited polymorphic decls.
> * trans.h : Add prototype for gfc_resize_class_size_with_len.
>
> gcc/testsuite/
> PR fortran/83118
> * gfortran.dg/dependency_57.f90: Change to dg-run and test
> for correct result.
> * gfortran.dg/unlimited_polymorphic_32.f03: New test.
>
> Footnote: I have come to the conclusion that
> gfc_trans_allocate_array_storage is the last place that we should be
> dealing with class array temporaries, or directly at least. I will give
> some thought as to how to do it better. Also, chunks of code are coming
> within scalarization loops that should be outside:
>   x->_vptr = (struct __vtype__STAR * {ref-all})
> &__vtab_INTEGER_4_;
>   x->_len = 0;
>   D.3977 = x->_vptr->_size;
>   D.3978 = x->_len;
>   D.3979 = D.3978 > 0 ? D.3977 * D.3978 : D.3977;
>
>
>

-- 
"If you can't explain it simply, you don't understand it well enough" -
Albert Einstein


Re: [Patch] Fortran: Add !GCC$ attributes DEPRECATED

2020-11-03 Thread Paul Richard Thomas via Gcc-patches
Hi Tobias,

That looks to be the best that can be done with a sensible amount of
effort. OK by me.

Regards

Paul


On Mon, 2 Nov 2020 at 19:09, Tobias Burnus  wrote:

> This adds the Fortran equivalent to __attribute__((deprecated)),
> except that "deprecated(message)" is not supported and that only
> procedures + variables (and parameters) are supported and not
> types.
>
> The issue came up with OpenMP,
> cf. https://gcc.gnu.org/pipermail/gcc-patches/2020-October/557359.html
>
> OK?
>
> Tobias
>
> PS: The location for parameter is not ideal as they are resolved
> too early; it can be improved, e.g., by moving to match_variable;
> but I am not sure it is worth doing so.
>
> -
> Mentor Graphics (Deutschland) GmbH, Arnulfstraße 201, 80634 München /
> Germany
> Registergericht München HRB 106955, Geschäftsführer: Thomas Heurung,
> Alexander Walter
>


-- 
"If you can't explain it simply, you don't understand it well enough" -
Albert Einstein


Re: [og8] Report errors on missing OpenACC reduction clauses in nested reductions

2020-11-03 Thread Thomas Schwinge
Hi!

On 2020-11-03T09:39:54+0100, I wrote:
> On 2018-12-20T15:28:33+0100, I wrote:
>> On behalf of Gergő (who doesn't have write access yet) I've pushed the
>> attached to openacc-gcc-8-branch.
>
> (Which then eventually got into master branch via Frederik.)

>> --- /dev/null
>> +++ b/gcc/testsuite/c-c++-common/goacc/nested-reductions-fail.c
>> @@ -0,0 +1,492 @@
>> +/* Test erroneous cases of nested reduction loops.  */
>> +
>> +void acc_parallel (void)
>> +{
>> +  int i, j, k, l, sum, diff;
>> +
>> +  #pragma acc parallel
>> +  {
>> +#pragma acc loop reduction(+:sum)
>> +for (i = 0; i < 10; i++)
>> +  #pragma acc loop // { dg-error "nested loop in reduction needs 
>> reduction clause for .sum." }
>> +  for (j = 0; j < 10; j++)
>> +#pragma acc loop reduction(+:sum)
>> +for (k = 0; k < 10; k++)
>> +  sum = 1;
>
>> +void acc_kernels (void)
>> +{
>> +  int i, j, k, sum, diff;
>> +
>> +  /* FIXME:  No diagnostics are produced for these loops because reductions
>> + in kernels regions are not supported yet.  */
>> +  #pragma acc kernels
>> +  {
>> +#pragma acc loop reduction(+:sum)
>> +for (i = 0; i < 10; i++)
>> +  for (j = 0; j < 10; j++)
>> +for (k = 0; k < 10; k++)
>> +  sum = 1;

Getting these diagnostics consistent is easy enough, however.  I've
pushed "[OpenACC] Enable inconsistent nested 'reduction' clauses checking
for OpenACC 'kernels'" to master branch in commit
64dc14b1a764bd3059170431c9b43c6192dbd48f, and backported to
releases/gcc-10 branch in commit
217fb4d4e59e7d6e03a3704f80f401e2a641dbe5, see attached.


Grüße
 Thomas


-
Mentor Graphics (Deutschland) GmbH, Arnulfstraße 201, 80634 München / Germany
Registergericht München HRB 106955, Geschäftsführer: Thomas Heurung, Alexander 
Walter
>From 64dc14b1a764bd3059170431c9b43c6192dbd48f Mon Sep 17 00:00:00 2001
From: Thomas Schwinge 
Date: Thu, 22 Oct 2020 11:04:22 +0200
Subject: [PATCH] [OpenACC] Enable inconsistent nested 'reduction' clauses
 checking for OpenACC 'kernels'

	gcc/
	* omp-low.c (scan_omp_for) : Move earlier inconsistent
	nested 'reduction' clauses checking.
	gcc/testsuite/
	* c-c++-common/goacc/nested-reductions-1-kernels.c: Extend.
	* c-c++-common/goacc/nested-reductions-2-kernels.c: Likewise.
	* gfortran.dg/goacc/nested-reductions-1-kernels.f90: Likewise.
	* gfortran.dg/goacc/nested-reductions-2-kernels.f90: Likewise.
---
 gcc/omp-low.c |  36 +-
 .../goacc/nested-reductions-1-kernels.c   | 199 +-
 .../goacc/nested-reductions-2-kernels.c   | 271 +-
 .../goacc/nested-reductions-1-kernels.f90 | 251 -
 .../goacc/nested-reductions-2-kernels.f90 | 346 +-
 5 files changed, 1063 insertions(+), 40 deletions(-)

diff --git a/gcc/omp-low.c b/gcc/omp-low.c
index de5142f979b0..2f1a544bd46e 100644
--- a/gcc/omp-low.c
+++ b/gcc/omp-low.c
@@ -2454,23 +2454,7 @@ scan_omp_for (gomp_for *stmt, omp_context *outer_ctx)
 	  }
 
   if (tgt && is_oacc_kernels (tgt))
-	{
-	  /* Strip out reductions, as they are not handled yet.  */
-	  tree *prev_ptr = &clauses;
-
-	  while (tree probe = *prev_ptr)
-	{
-	  tree *next_ptr = &OMP_CLAUSE_CHAIN (probe);
-	  
-	  if (OMP_CLAUSE_CODE (probe) == OMP_CLAUSE_REDUCTION)
-		*prev_ptr = *next_ptr;
-	  else
-		prev_ptr = next_ptr;
-	}
-
-	  gimple_omp_for_set_clauses (stmt, clauses);
-	  check_oacc_kernel_gwv (stmt, ctx);
-	}
+	check_oacc_kernel_gwv (stmt, ctx);
 
   /* Collect all variables named in reductions on this loop.  Ensure
 	 that, if this loop has a reduction on some variable v, and there is
@@ -2553,6 +2537,24 @@ scan_omp_for (gomp_for *stmt, omp_context *outer_ctx)
   ctx->outer_reduction_clauses
 	= chainon (unshare_expr (ctx->local_reduction_clauses),
 		   ctx->outer_reduction_clauses);
+
+  if (tgt && is_oacc_kernels (tgt))
+	{
+	  /* Strip out reductions, as they are not handled yet.  */
+	  tree *prev_ptr = &clauses;
+
+	  while (tree probe = *prev_ptr)
+	{
+	  tree *next_ptr = &OMP_CLAUSE_CHAIN (probe);
+
+	  if (OMP_CLAUSE_CODE (probe) == OMP_CLAUSE_REDUCTION)
+		*prev_ptr = *next_ptr;
+	  else
+		prev_ptr = next_ptr;
+	}
+
+	  gimple_omp_for_set_clauses (stmt, clauses);
+	}
 }
 
   scan_sharing_clauses (clauses, ctx);
diff --git a/gcc/testsuite/c-c++-common/goacc/nested-reductions-1-kernels.c b/gcc/testsuite/c-c++-common/goacc/nested-reductions-1-kernels.c
index 68cb8f82ee57..9323e2c8d7e3 100644
--- a/gcc/testsuite/c-c++-common/goacc/nested-reductions-1-kernels.c
+++ b/gcc/testsuite/c-c++-common/goacc/nested-reductions-1-kernels.c
@@ -6,8 +6,6 @@ void acc_kernels (void)
 {
   int i, j, k, sum, diff;
 
-  /* FIXME:  These tests are not meaningful yet because reductions in
- kernels regions are not supported yet.  */
   #pragma acc kernels
   {
 #pragma acc loop reduction(+:sum)
@@ -16,6 +14,12 @@ void acc_kernels (void)
 for (k = 0; k < 10;

[patch] Add dg-require-effective-target fpic to gcc.target/powerpc tests

2020-11-03 Thread Olivier Hainque
Hello,

This change is a proposal to add 

 /* { dg-require-effective-target fpic } */

to a few tests in gcc.target/powerpc that do use
-fpic or -fPIC but don't currently query the target
support.

This corresponds to what many other fpic tests do
and helps the vxWorks ports at least, as -fpic is
typically not supported in at least one of the two
major modes of such port (kernel vs RTP).

Ok to commit?

Thanks in advance!

Best regards,

Olivier

2020-11-03  Olivier Hainque  

gcc/testsuite/
* gcc.target/powerpc/pr67789.c: Add
dg-require-effective-target fpic.
* gcc.target/powerpc/pr83629.c: Likewise.
* gcc.target/powerpc/pr84112.c: Likewise.

diff --git a/gcc/testsuite/gcc.target/powerpc/pr67789.c 
b/gcc/testsuite/gcc.target/powerpc/pr67789.c
index 371d7a3d8ede..05d01ef20d77 100644
--- a/gcc/testsuite/gcc.target/powerpc/pr67789.c
+++ b/gcc/testsuite/gcc.target/powerpc/pr67789.c
@@ -1,4 +1,5 @@
 /* { dg-do assemble } */
+/* { dg-require-effective-target fpic } */
 /* { dg-options "-O2 -msecure-plt -fPIC" } */
 /* { dg-skip-if "" { powerpc*-*-darwin* powerpc-ibm-aix* } } */
 
diff --git a/gcc/testsuite/gcc.target/powerpc/pr83629.c 
b/gcc/testsuite/gcc.target/powerpc/pr83629.c
index 250378ec485c..976b564e927d 100644
--- a/gcc/testsuite/gcc.target/powerpc/pr83629.c
+++ b/gcc/testsuite/gcc.target/powerpc/pr83629.c
@@ -1,4 +1,5 @@
 /* { dg-require-effective-target ilp32 } */
+/* { dg-require-effective-target fpic } */
 /* { dg-options "-O2 -fPIC -frename-registers 
--param=sched-autopref-queue-depth=0 -mdejagnu-cpu=603" } */
 
 extern void bar (void *);
diff --git a/gcc/testsuite/gcc.target/powerpc/pr84112.c 
b/gcc/testsuite/gcc.target/powerpc/pr84112.c
index cd429df41a0a..c606f5b98552 100644
--- a/gcc/testsuite/gcc.target/powerpc/pr84112.c
+++ b/gcc/testsuite/gcc.target/powerpc/pr84112.c
@@ -1,4 +1,5 @@
 /* { dg-do compile { target powerpc*-*-* } }*/
+/* { dg-require-effective-target fpic } */
 /* { dg-options "-mdejagnu-cpu=power8 -O3 -fstack-protector-strong -fpic" } */
 
 char *b;
-- 
2.17.1



[PATCH] testsuite/97688 - fix check_vect () with __AVX2__

2020-11-03 Thread Richard Biener
This fixes the cpuid check to always specify subleaf zero,
which is required to detect AVX2 and doesn't hurt for level one.
Without this fix we get zero runtime coverage when -mavx2 is
specified.

Tested on x86_64-unknown-linux-gnu.

2020-11-03  Richard Biener  

PR testsuite/97688
* gcc.dg/vect/tree-vect.h (check_vect): Fix the x86 cpuid
check to always specify subleaf zero.
---
 gcc/testsuite/gcc.dg/vect/tree-vect.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gcc/testsuite/gcc.dg/vect/tree-vect.h 
b/gcc/testsuite/gcc.dg/vect/tree-vect.h
index 5d8d9eba3f8..c4b81441216 100644
--- a/gcc/testsuite/gcc.dg/vect/tree-vect.h
+++ b/gcc/testsuite/gcc.dg/vect/tree-vect.h
@@ -52,7 +52,7 @@ check_vect (void)
 want_level = 1, want_d = bit_SSE2;
 # endif
 
-if (!__get_cpuid (want_level, &a, &b, &c, &d)
+if (!__get_cpuid_count (want_level, 0, &a, &b, &c, &d)
|| ((b & want_b) | (c & want_c) | (d & want_d)) == 0)
   exit (0);
   }
-- 
2.26.2


Re: [PATCH] Fix PR97205

2020-11-03 Thread Richard Biener
On Mon, 2 Nov 2020, Bernd Edlinger wrote:

> On 11/2/20 3:07 PM, Richard Biener wrote:
> > On Mon, 2 Nov 2020, Bernd Edlinger wrote:
> > 
> >> Hi,
> >>
> >> this makes sure that stack allocated SSA_NAMEs are
> >> at least MODE_ALIGNED.  Also increase the MEM_ALIGN
> >> for the corresponding rtl objects.
> >>
> >>
> >> Tested on x86_64-pc-linux-gnu and arm-none-eabi.
> >>
> >> OK for trunk?
> > 
> > 
> > @@ -1022,6 +1030,14 @@ expand_one_stack_var_at (tree decl, rtx base,
> > unsigned base_align,
> >  }
> >  
> >set_rtl (decl, x);
> > +
> > +  if (TREE_CODE (decl) == SSA_NAME
> > +  && GET_MODE (x) != BLKmode
> > +  && MEM_ALIGN (x) < GET_MODE_ALIGNMENT (GET_MODE (x)))
> > +{
> > +  gcc_checking_assert (GET_MODE_ALIGNMENT (GET_MODE (x)) <=
> > base_align);
> > +  set_mem_align (x, GET_MODE_ALIGNMENT (GET_MODE (x)));
> > +}
> >  }
> >  
> > 
> > I wonder whether we cannot "fix" set_rtl to not call
> > set_mem_attributes in this path, maybe directly call
> > set_mem_align there instead?  That is, the preceeding
> > code for ! SSA_NAME already tries to adjust alignment
> > to honor that of the actual stack slot - IMHO the
> > non-SSA and SSA cases should be merged after this
> > patch, but maybe simply by calling set_mem_align
> > instead of doing the DECL_ALIGN frobbing and do
> > the alignment compute also for SSA_NAMEs?
> > 
> > The other pieces look OK but the above is a bit ugly
> > at the moment.
> > 
> 
> Hmm, how about this?

That would work for me.  Guess removing the DECL_ALIGN frobbing
in the != SSA_NAME path didn't work out or you didn't try out
of caution?

Richard.

> --- a/gcc/cfgexpand.c
> +++ b/gcc/cfgexpand.c
> @@ -1007,20 +1007,21 @@ expand_one_stack_var_at (tree decl, rtx base, 
> unsigned base_align,
>x = plus_constant (Pmode, base, offset);
>x = gen_rtx_MEM (TREE_CODE (decl) == SSA_NAME
>? TYPE_MODE (TREE_TYPE (decl))
> -  : DECL_MODE (SSAVAR (decl)), x);
> +  : DECL_MODE (decl), x);
> +
> +  /* Set alignment we actually gave this decl if it isn't an SSA name.
> + If it is we generate stack slots only accidentally so it isn't as
> + important, we'll simply set the alignment directly on the MEM_P.  */
> +
> +  if (base == virtual_stack_vars_rtx)
> +offset -= frame_phase;
> +  align = known_alignment (offset);
> +  align *= BITS_PER_UNIT;
> +  if (align == 0 || align > base_align)
> +align = base_align;
>  
>if (TREE_CODE (decl) != SSA_NAME)
>  {
> -  /* Set alignment we actually gave this decl if it isn't an SSA name.
> - If it is we generate stack slots only accidentally so it isn't as
> -important, we'll simply use the alignment that is already set.  */
> -  if (base == virtual_stack_vars_rtx)
> -   offset -= frame_phase;
> -  align = known_alignment (offset);
> -  align *= BITS_PER_UNIT;
> -  if (align == 0 || align > base_align)
> -   align = base_align;
> -
>/* One would think that we could assert that we're not decreasing
>  alignment here, but (at least) the i386 port does exactly this
>  via the MINIMUM_ALIGNMENT hook.  */
> @@ -1031,13 +1032,7 @@ expand_one_stack_var_at (tree decl, rtx base, unsigned 
> base_align,
>  
>set_rtl (decl, x);
>  
> -  if (TREE_CODE (decl) == SSA_NAME
> -  && GET_MODE (x) != BLKmode
> -  && MEM_ALIGN (x) < GET_MODE_ALIGNMENT (GET_MODE (x)))
> -{
> -  gcc_checking_assert (GET_MODE_ALIGNMENT (GET_MODE (x)) <= base_align);
> -  set_mem_align (x, GET_MODE_ALIGNMENT (GET_MODE (x)));
> -}
> +  set_mem_align (x, align);
>  }
>  
>  class stack_vars_data
> 
> 
> Is it OK if it passes bootstrap and regtesting ?
> 
> Thanks
> Bernd.
> 
> > Thanks,
> > Richard,
> > 
> >>
> >>
> >> Thanks
> >> Bernd.
> >>
> > 
> 

-- 
Richard Biener 
SUSE Software Solutions Germany GmbH, Maxfeldstrasse 5, 90409 Nuernberg,
Germany; GF: Felix Imend


RE: [PATCH] aarch64: Add backend support for expanding __builtin_memset

2020-11-03 Thread Sudakshina Das via Gcc-patches
Hi Richard

> -Original Message-
> From: Richard Sandiford 
> Sent: 30 October 2020 19:56
> To: Sudakshina Das 
> Cc: Wilco Dijkstra ; gcc-patches@gcc.gnu.org;
> Kyrylo Tkachov ; Richard Earnshaw
> 
> Subject: Re: [PATCH] aarch64: Add backend support for expanding
> __builtin_memset
> 
> > +  base = copy_to_mode_reg (Pmode, XEXP (dst, 0));  dst =
> > + adjust_automodify_address (dst, VOIDmode, base, 0);
> > +
> > +  /* Prepare the val using a DUP v0.16B, val.  */  if (CONST_INT_P
> > + (val))
> > +{
> > +  val = force_reg (QImode, val);
> > +}
> > +  src = gen_reg_rtx (V16QImode);
> > +  emit_insn (gen_aarch64_simd_dupv16qi(src, val));
> 
> I think we should use:
> 
>   src = expand_vector_broadcast (V16QImode, val);
> 
> here (without the CONST_INT_P check), so that for constants we just move a
> constant directly into a register.
>

Sorry to bring this up again. When I tried expand_vector_broadcast, I
saw the following behaviour:
for __builtin_memset(p, 1, 24) where the duplicated constant fits
moviv0.16b, 0x1
mov x1, 72340172838076673
str x1, [x0, 16]
str q0, [x0]
and an ICE for __builtin_memset(p, 1, 32) where I am guessing the duplicated
constant does not fit
x.c:7:30: error: unrecognizable insn:
7 | { __builtin_memset(p, 1, 32);}
  |  ^
(insn 8 7 0 2 (parallel [
(set (mem:V16QI (reg:DI 94) [0 MEM  [(void *)p_2(D)]+0 
S16 A8])
(const_vector:V16QI [
(const_int 1 [0x1]) repeated x16
]))
(set (mem:V16QI (plus:DI (reg:DI 94)
(const_int 16 [0x10])) [0 MEM  [(void 
*)p_2(D)]+16 S16 A8])
(const_vector:V16QI [
(const_int 1 [0x1]) repeated x16
]))
]) "x.c":7:3 -1
 (nil))
during RTL pass: vregs

> Sudakshina Das  writes:
> >> > +
> >> > +  /* "Cast" the *dst to the correct mode.  */  *dst =
> >> > + adjust_address (*dst, mode, 0);
> >> > +  /* Emit the memset.  */
> >> > +  emit_move_insn (*dst, reg);
> >> > +  /* Move the pointer forward.  */  *dst =
> >> > + aarch64_progress_pointer (*dst); }
> >> > +
> >> > +/* Expand setmem, as if from a __builtin_memset.  Return true if
> >> > +   we succeed, otherwise return false.  */
> >> > +
> >> > +bool
> >> > +aarch64_expand_setmem (rtx *operands) {
> >> > +  int n, mode_bits;
> >> > +  unsigned HOST_WIDE_INT len;
> >> > +  rtx dst = operands[0];
> >> > +  rtx val = operands[2], src;
> >> > +  rtx base;
> >> > +  machine_mode cur_mode = BLKmode, next_mode;
> >> > +  bool speed_p = !optimize_function_for_size_p (cfun);
> >> > +  unsigned max_set_size = speed_p ? 256 : 128;
> >>
> >> What's the basis for the size value?  AIUI (and I've probably got
> >> this wrong), that effectively means a worst case of 3+2 stores
> >> (3 STP Qs and 2 mop-up stores).  Then we need one instruction to set
> >> up the constant.  So if that's right, it looks like the worst-case size is 
> >> 6
> instructions.
> >>
> >> AARCH64_CALL_RATIO has a value of 8, but I'm not sure how that
> >> relates to the number of instructions in a call.  I guess the best
> >> case is 4 (3 instructions for the parameters and one for the call itself).
> >>
> >
> > This one I will ask Wilco to chime in. We discussed offline what would
> > be the largest case that this builtin should allow and he suggested
> > 256-bytes. It would actually generate 9 instructions (its in the memset-
> corner-case.c).
> > Personally I am not sure what the best decisions are in this case so I
> > will rely on Wilco's suggestions.
> 
> Ah, sorry, by “the size value”, I meant the !speed_p value of 128.
> I now realise that that was far from clear given that the variable is called
> max_set_size :-)
> 
> So yeah, I'm certainly not questioning the speed_p value of 256.
> I'm sure you and Wilco have picked the best value for that.  But -Os stuff can
> usually be justified on first principles and I wasn't sure where the value of 
> 128
> came from.
>

I had another chat with Wilco about the 128-byte value for !speed_p. We
estimate the average number of instructions up to 128 bytes would be ~3, which
is similar to doing a memset call. But I did go back and think about the tuning
argument of  AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS a bit more because
you are right that based on that the average instructions can become double.
I would propose using 256/128 based on speed_p but halving the value based on 
the
tune parameter. Obviously the assumption here is that we are respecting the 
core's
choice of avoiding stp of q registers (given that I do not see other uses of
AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS being changed by -Os).

There might be a debate on how useful AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS
is in the context of memset/memcpy but that needs more analysis and I would
say should be a separate patch.
 
> >> > +
> >> > +  /* Convert len to bits to make the rest of the co

Re: [patch] lto test: Add dg-require-effective-target lto on test using -flto

2020-11-03 Thread Richard Biener via Gcc-patches
On Mon, Nov 2, 2020 at 6:25 PM Olivier Hainque  wrote:
>
> Hello,
>
> This patch is a proposal to add a
>
>   /* { dg-require-effective-target lto } */
>
> directive to a "dg-do link" test using -flto, to
> prevent failure on targets missing LTO support such
> as VxWorks for its "kernel" mode where modules are
> only partially linked.
>
> I have checked that the test still runs and passes
> on a regular native x86_64-linux environment.
>
> Ok to commit ?

Sure - consider such fixes obvious in the future.

Richard.

> Thanks in advance,
>
> Best Regards,
>
> Olivier
>
> 2020-11-02  Olivier Hainque  
>
> testsuite/
> * gcc.dg/tree-ssa/pr71077.c: Add
> dg-require-effective-target lto.
>
>
>


Re: [PATCH] Fix PR97205

2020-11-03 Thread Bernd Edlinger



On 11/3/20 10:34 AM, Richard Biener wrote:
> On Mon, 2 Nov 2020, Bernd Edlinger wrote:
> 
>> On 11/2/20 3:07 PM, Richard Biener wrote:
>>> On Mon, 2 Nov 2020, Bernd Edlinger wrote:
>>>
 Hi,

 this makes sure that stack allocated SSA_NAMEs are
 at least MODE_ALIGNED.  Also increase the MEM_ALIGN
 for the corresponding rtl objects.


 Tested on x86_64-pc-linux-gnu and arm-none-eabi.

 OK for trunk?
>>>
>>>
>>> @@ -1022,6 +1030,14 @@ expand_one_stack_var_at (tree decl, rtx base,
>>> unsigned base_align,
>>>  }
>>>  
>>>set_rtl (decl, x);
>>> +
>>> +  if (TREE_CODE (decl) == SSA_NAME
>>> +  && GET_MODE (x) != BLKmode
>>> +  && MEM_ALIGN (x) < GET_MODE_ALIGNMENT (GET_MODE (x)))
>>> +{
>>> +  gcc_checking_assert (GET_MODE_ALIGNMENT (GET_MODE (x)) <=
>>> base_align);
>>> +  set_mem_align (x, GET_MODE_ALIGNMENT (GET_MODE (x)));
>>> +}
>>>  }
>>>  
>>>
>>> I wonder whether we cannot "fix" set_rtl to not call
>>> set_mem_attributes in this path, maybe directly call
>>> set_mem_align there instead?  That is, the preceeding
>>> code for ! SSA_NAME already tries to adjust alignment
>>> to honor that of the actual stack slot - IMHO the
>>> non-SSA and SSA cases should be merged after this
>>> patch, but maybe simply by calling set_mem_align
>>> instead of doing the DECL_ALIGN frobbing and do
>>> the alignment compute also for SSA_NAMEs?
>>>
>>> The other pieces look OK but the above is a bit ugly
>>> at the moment.
>>>
>>
>> Hmm, how about this?
> 
> That would work for me.  Guess removing the DECL_ALIGN frobbing
> in the != SSA_NAME path didn't work out or you didn't try out
> of caution?
> 

I didn't try, since it felt simply more correct this way,
and get_object_alignment would probably give a different
answer since it uses DECL_ALIGN too.


Bernd.


[PATCH] tree-optimization/97678 - fix SLP induction epilogue vectorization

2020-11-03 Thread Richard Biener
This restores not tracking SLP nodes for induction initial values
in not nested context because this interferes with peeling and
epilogue vectorization.

Bootstrapped and tested on x86_64-unknown-linux-gnu, pushed.

2020-11-03  Richard Biener  

PR tree-optimization/97678
* tree-vect-slp.c (vect_build_slp_tree_2): Do not track
the initial values of inductions when not nested.
* tree-vect-loop.c (vectorizable_induction): Look at
PHI node initial values again for SLP and not nested
inductions.  Handle LOOP_VINFO_MASK_SKIP_NITERS and cost
invariants.

* gcc.dg/vect/pr97678.c: New testcase.
---
 gcc/testsuite/gcc.dg/vect/pr97678.c | 29 +
 gcc/tree-vect-loop.c| 49 ++---
 gcc/tree-vect-slp.c |  8 +++--
 3 files changed, 79 insertions(+), 7 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/vect/pr97678.c

diff --git a/gcc/testsuite/gcc.dg/vect/pr97678.c 
b/gcc/testsuite/gcc.dg/vect/pr97678.c
new file mode 100644
index 000..ebe4a35bb3f
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/pr97678.c
@@ -0,0 +1,29 @@
+/* { dg-require-effective-target vect_int } */
+/* { dg-additional-options "-mavx2" { target avx2_runtime } } */
+
+#include "tree-vect.h"
+
+int
+main ()
+{
+  unsigned int i = 0;
+  unsigned short b[158 * 2];
+
+  check_vect ();
+
+  for (i = 0; i < 158; i++)
+{
+  b[i * 2] = i * 7;
+  b[i * 2 + 1] = i * 8;
+}
+
+  for (i = 0; i < 158; ++i)
+if (b[i*2] != (unsigned short)(i*7)
+|| b[i*2+1] != (unsigned short)(i*8))
+  abort ();
+
+  return 0;
+}
+
+/* The init loop should be vectorized with SLP.  */
+/* { dg-final { scan-tree-dump "vectorizing stmts using SLP" "vect" } } */
diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index fcea28935bc..6fa185daa28 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -7800,6 +7800,10 @@ vectorizable_induction (loop_vec_info loop_vinfo,
= record_stmt_cost (cost_vec,
SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
vector_stmt, stmt_info, 0, vect_body);
+ /* prologue cost for vec_init (if not nested) and step.  */
+ prologue_cost = record_stmt_cost (cost_vec, 1 + !nested_in_vect_loop,
+   scalar_to_vec,
+   stmt_info, 0, vect_prologue);
}
   else /* if (!slp_node) */
{
@@ -7858,9 +7862,15 @@ vectorizable_induction (loop_vec_info loop_vinfo,
 cycles we have to reconstruct the step from SCEV data.  */
   unsigned group_size = SLP_TREE_LANES (slp_node);
   tree *steps = XALLOCAVEC (tree, group_size);
+  tree *inits = XALLOCAVEC (tree, group_size);
   stmt_vec_info phi_info;
   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node), i, phi_info)
-   steps[i] = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
+   {
+ steps[i] = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
+ if (!init_node)
+   inits[i] = gimple_phi_arg_def (as_a (phi_info->stmt),
+  pe->dest_idx);
+   }
 
   /* Now generate the IVs.  */
   unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
@@ -7875,16 +7885,39 @@ vectorizable_induction (loop_vec_info loop_vinfo,
 ? build_real_from_wide (stept, lup_mul,
 UNSIGNED)
 : build_int_cstu (stept, lup_mul));
+  tree peel_mul = NULL_TREE;
+  if (LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo))
+   {
+ stmts = NULL;
+ if (SCALAR_FLOAT_TYPE_P (stept))
+   peel_mul = gimple_build (&stmts, FLOAT_EXPR, stept,
+LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
+ else
+   peel_mul = gimple_convert (&stmts, stept,
+  LOOP_VINFO_MASK_SKIP_NITERS 
(loop_vinfo));
+ peel_mul = gimple_build_vector_from_val (&stmts, step_vectype, 
peel_mul);
+ if (stmts)
+   {
+ new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
+ gcc_assert (!new_bb);
+   }
+   }
   unsigned ivn;
   auto_vec vec_steps;
   for (ivn = 0; ivn < nivs; ++ivn)
{
- tree_vector_builder elts (step_vectype, const_nunits, 1);
+ tree_vector_builder step_elts (step_vectype, const_nunits, 1);
+ tree_vector_builder init_elts (vectype, const_nunits, 1);
  tree_vector_builder mul_elts (step_vectype, const_nunits, 1);
  for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
{
  tree elt = steps[(ivn*const_nunits + eltn) % group_size];
- elts.quick_push (elt);
+ step_elts.quick_push (elt);
+ if (!init_node)
+   {
+ 

Re: [PATCH] Fix PR97205

2020-11-03 Thread Richard Biener
On Tue, 3 Nov 2020, Bernd Edlinger wrote:

> 
> 
> On 11/3/20 10:34 AM, Richard Biener wrote:
> > On Mon, 2 Nov 2020, Bernd Edlinger wrote:
> > 
> >> On 11/2/20 3:07 PM, Richard Biener wrote:
> >>> On Mon, 2 Nov 2020, Bernd Edlinger wrote:
> >>>
>  Hi,
> 
>  this makes sure that stack allocated SSA_NAMEs are
>  at least MODE_ALIGNED.  Also increase the MEM_ALIGN
>  for the corresponding rtl objects.
> 
> 
>  Tested on x86_64-pc-linux-gnu and arm-none-eabi.
> 
>  OK for trunk?
> >>>
> >>>
> >>> @@ -1022,6 +1030,14 @@ expand_one_stack_var_at (tree decl, rtx base,
> >>> unsigned base_align,
> >>>  }
> >>>  
> >>>set_rtl (decl, x);
> >>> +
> >>> +  if (TREE_CODE (decl) == SSA_NAME
> >>> +  && GET_MODE (x) != BLKmode
> >>> +  && MEM_ALIGN (x) < GET_MODE_ALIGNMENT (GET_MODE (x)))
> >>> +{
> >>> +  gcc_checking_assert (GET_MODE_ALIGNMENT (GET_MODE (x)) <=
> >>> base_align);
> >>> +  set_mem_align (x, GET_MODE_ALIGNMENT (GET_MODE (x)));
> >>> +}
> >>>  }
> >>>  
> >>>
> >>> I wonder whether we cannot "fix" set_rtl to not call
> >>> set_mem_attributes in this path, maybe directly call
> >>> set_mem_align there instead?  That is, the preceeding
> >>> code for ! SSA_NAME already tries to adjust alignment
> >>> to honor that of the actual stack slot - IMHO the
> >>> non-SSA and SSA cases should be merged after this
> >>> patch, but maybe simply by calling set_mem_align
> >>> instead of doing the DECL_ALIGN frobbing and do
> >>> the alignment compute also for SSA_NAMEs?
> >>>
> >>> The other pieces look OK but the above is a bit ugly
> >>> at the moment.
> >>>
> >>
> >> Hmm, how about this?
> > 
> > That would work for me.  Guess removing the DECL_ALIGN frobbing
> > in the != SSA_NAME path didn't work out or you didn't try out
> > of caution?
> > 
> 
> I didn't try, since it felt simply more correct this way,
> and get_object_alignment would probably give a different
> answer since it uses DECL_ALIGN too.

OK, I see.

Richard.


Re: [patch] i386 tests: Add dg-require-effective-target fpic to gcc.target/i386 tests

2020-11-03 Thread Olivier Hainque


> On 3 Nov 2020, at 09:30, Jakub Jelinek  wrote:

> 70% of the tests you've changed have a target *-linux* or similar
> right above that line, what is the point of adding the fpic
> effective targets to those?  Those surely aren't run on vxWorks
> and on x86 Linux fpic is always supported.

Good point. We had quite a few actual failures
and I did a pretty systematic search, missing the
linux implication.

> No objection to adding it to the rest.

Ok, amended change below.

Thanks for your prompt feedback on this Jakub!

Best Regards,

Olivier


2020-11-03  Olivier Hainque  

gcc/testsuite/

* gcc.target/i386/pr45352-1.c: Add dg-require-effective-target fpic.
* gcc.target/i386/pr47602.c: Likewise.
* gcc.target/i386/pr55151.c: Likewise.
* gcc.target/i386/pr55458.c: Likewise.
* gcc.target/i386/pr56348.c: Likewise.
* gcc.target/i386/pr57097.c: Likewise.
* gcc.target/i386/pr65753.c: Likewise.
* gcc.target/i386/pr65915.c: Likewise.
* gcc.target/i386/pr66232-5.c: Likewise.
* gcc.target/i386/pr66334.c: Likewise.
* gcc.target/i386/pr66819-2.c: Likewise.
* gcc.target/i386/pr67265.c: Likewise.
* gcc.target/i386/pr81481.c: Likewise.
* gcc.target/i386/pr83994.c: Likewise.

---
 gcc/testsuite/gcc.target/i386/pr45352-1.c | 1 +
 gcc/testsuite/gcc.target/i386/pr47602.c   | 1 +
 gcc/testsuite/gcc.target/i386/pr55151.c   | 1 +
 gcc/testsuite/gcc.target/i386/pr55458.c   | 1 +
 gcc/testsuite/gcc.target/i386/pr56348.c   | 1 +
 gcc/testsuite/gcc.target/i386/pr57097.c   | 1 +
 gcc/testsuite/gcc.target/i386/pr65753.c   | 1 +
 gcc/testsuite/gcc.target/i386/pr65915.c   | 1 +
 gcc/testsuite/gcc.target/i386/pr66232-5.c | 1 +
 gcc/testsuite/gcc.target/i386/pr66334.c   | 1 +
 gcc/testsuite/gcc.target/i386/pr66819-2.c | 1 +
 gcc/testsuite/gcc.target/i386/pr67265.c   | 1 +
 gcc/testsuite/gcc.target/i386/pr81481.c   | 1 +
 gcc/testsuite/gcc.target/i386/pr83994.c   | 1 +
 14 files changed, 14 insertions(+)

diff --git a/gcc/testsuite/gcc.target/i386/pr45352-1.c 
b/gcc/testsuite/gcc.target/i386/pr45352-1.c
index 5cd1bd842d80..f5e96b270166 100644
--- a/gcc/testsuite/gcc.target/i386/pr45352-1.c
+++ b/gcc/testsuite/gcc.target/i386/pr45352-1.c
@@ -1,4 +1,5 @@
 /* { dg-do compile } */
+/* { dg-require-effective-target fpic } */
 /* { dg-options "-mtune=amdfam10 -O3 -fpeel-loops -fselective-scheduling2 
-fsel-sched-pipelining -fPIC" } */
 
 static int FIR_Tab_16[16][16];
diff --git a/gcc/testsuite/gcc.target/i386/pr47602.c 
b/gcc/testsuite/gcc.target/i386/pr47602.c
index fa5f5bd7d351..5ed1e1fd490c 100644
--- a/gcc/testsuite/gcc.target/i386/pr47602.c
+++ b/gcc/testsuite/gcc.target/i386/pr47602.c
@@ -1,5 +1,6 @@
 /* { dg-do compile } */
 /* { dg-require-effective-target ia32 } */
+/* { dg-require-effective-target fpic } */
 /* { dg-options "-fPIC" } */
 
 /* Test verifies that %ebx is no longer fixed when generating PIC code on 
i686.  */
diff --git a/gcc/testsuite/gcc.target/i386/pr55151.c 
b/gcc/testsuite/gcc.target/i386/pr55151.c
index 62da8cb7781e..d6255a83dee7 100644
--- a/gcc/testsuite/gcc.target/i386/pr55151.c
+++ b/gcc/testsuite/gcc.target/i386/pr55151.c
@@ -1,5 +1,6 @@
 /* PR rtl-optimization/55151 */
 /* { dg-do compile  { target { ! ia32 } } } */
+/* { dg-require-effective-target fpic } */
 /* { dg-options "-fPIC" } */
 
 int a, b, c, d, e, f, g, h, i, j, k, l;
diff --git a/gcc/testsuite/gcc.target/i386/pr55458.c 
b/gcc/testsuite/gcc.target/i386/pr55458.c
index 7164ca905db2..1dea55c50c9e 100644
--- a/gcc/testsuite/gcc.target/i386/pr55458.c
+++ b/gcc/testsuite/gcc.target/i386/pr55458.c
@@ -1,5 +1,6 @@
 /* { dg-do compile } */
 /* { dg-require-effective-target ia32 } */
+/* { dg-require-effective-target fpic } */
 /* { dg-options "-fPIC" } */
 
 /* Test verifies that %ebx is no longer fixed when generating PIC code on 
i686.  */
diff --git a/gcc/testsuite/gcc.target/i386/pr56348.c 
b/gcc/testsuite/gcc.target/i386/pr56348.c
index c31814f60f47..93423d9190d9 100644
--- a/gcc/testsuite/gcc.target/i386/pr56348.c
+++ b/gcc/testsuite/gcc.target/i386/pr56348.c
@@ -1,5 +1,6 @@
 /* PR target/56348 */
 /* { dg-do compile { target ia32 } } */
+/* { dg-require-effective-target fpic } */
 /* { dg-options "-O2 -fPIC -mfpmath=sse -msse2" } */
 
 typedef unsigned int size_t;
diff --git a/gcc/testsuite/gcc.target/i386/pr57097.c 
b/gcc/testsuite/gcc.target/i386/pr57097.c
index 2f0093840df5..debacbfc048b 100644
--- a/gcc/testsuite/gcc.target/i386/pr57097.c
+++ b/gcc/testsuite/gcc.target/i386/pr57097.c
@@ -1,4 +1,5 @@
 /* { dg-do compile } */
+/* { dg-require-effective-target fpic } */
 /* { dg-options "-O2 -fPIC" } */
 extern double ad[], bd[], cd[], dd[];
 extern long long all[], bll[], cll[], dll[];
diff --git a/gcc/testsuite/gcc.target/i386/pr65753.c 
b/gcc/testsuite/gcc.target/i386/pr65753.c
index 562f54bff6bd..117d6ed0cf17 100644
--- a/gcc/testsuite/gcc.target/i386/pr65753.c
+++ b/gcc/testsuite/gcc.target/i386/pr65753.c
@@ -1,4 +1,5 @@
 /* { d

Re: [patch] i386 tests: Add dg-require-effective-target fpic to gcc.target/i386 tests

2020-11-03 Thread Jakub Jelinek via Gcc-patches
On Tue, Nov 03, 2020 at 11:24:24AM +0100, Olivier Hainque wrote:
> 
> 
> > On 3 Nov 2020, at 09:30, Jakub Jelinek  wrote:
> 
> > 70% of the tests you've changed have a target *-linux* or similar
> > right above that line, what is the point of adding the fpic
> > effective targets to those?  Those surely aren't run on vxWorks
> > and on x86 Linux fpic is always supported.
> 
> Good point. We had quite a few actual failures
> and I did a pretty systematic search, missing the
> linux implication.
> 
> > No objection to adding it to the rest.
> 
> Ok, amended change below.
> 
> Thanks for your prompt feedback on this Jakub!

Ok, thanks.

> 2020-11-03  Olivier Hainque  
> 
> gcc/testsuite/
> 
> * gcc.target/i386/pr45352-1.c: Add dg-require-effective-target fpic.
> * gcc.target/i386/pr47602.c: Likewise.
> * gcc.target/i386/pr55151.c: Likewise.
> * gcc.target/i386/pr55458.c: Likewise.
> * gcc.target/i386/pr56348.c: Likewise.
> * gcc.target/i386/pr57097.c: Likewise.
> * gcc.target/i386/pr65753.c: Likewise.
> * gcc.target/i386/pr65915.c: Likewise.
> * gcc.target/i386/pr66232-5.c: Likewise.
> * gcc.target/i386/pr66334.c: Likewise.
> * gcc.target/i386/pr66819-2.c: Likewise.
> * gcc.target/i386/pr67265.c: Likewise.
> * gcc.target/i386/pr81481.c: Likewise.
> * gcc.target/i386/pr83994.c: Likewise.

Jakub



RE: [PATCH 1/x] arm: Add vld1_lane_bf16 + vldq_lane_bf16 intrinsics

2020-11-03 Thread Kyrylo Tkachov via Gcc-patches
Hi Andrea,

> -Original Message-
> From: Andrea Corallo 
> Sent: 26 October 2020 15:59
> To: gcc-patches@gcc.gnu.org
> Cc: Kyrylo Tkachov ; Richard Earnshaw
> ; nd 
> Subject: [PATCH 1/x] arm: Add vld1_lane_bf16 + vldq_lane_bf16 intrinsics
> 
> Hi all,
> 
> I'd like to submit the following patch implementing the bfloat16_t
> neon related load intrinsics: vld1_lane_bf16, vld1q_lane_bf16.
> 
> Please see refer to:
> ACLE 
> ISA  
> 
> Regtested and bootstrapped.
> 
> Okay for trunk?

Ok.
Thanks,
Kyrill


> 
>   Andrea



RE: [PATCH 2/x] arm: add vst1_lane_bf16 + vstq_lane_bf16 intrinsics

2020-11-03 Thread Kyrylo Tkachov via Gcc-patches
Hi Andrea,

> -Original Message-
> From: Andrea Corallo 
> Sent: 26 October 2020 16:02
> To: gcc-patches@gcc.gnu.org
> Cc: Kyrylo Tkachov ; Richard Earnshaw
> ; nd 
> Subject: [PATCH 2/x] arm: add vst1_lane_bf16 + vstq_lane_bf16 intrinsics
> 
> Hi all,
> 
> Second patch of the series here adding vst1_lane_bf16, vst1q_lane_bf16
> bfloat16 related neon intrinsics.
> 
> Please see refer to:
> ACLE 
> ISA  
> 
> Regtested and bootstrapped.
> 
> Okay for trunk?

Ok.
Thanks,
Kyrill

> 
>   Andrea
> 


RE: [PATCH 4/x] arm: Add vst1_bf16 + vst1q_bf16 intrinsics

2020-11-03 Thread Kyrylo Tkachov via Gcc-patches
Hi Andrea,

> -Original Message-
> From: Andrea Corallo 
> Sent: 02 November 2020 09:02
> To: gcc-patches@gcc.gnu.org
> Cc: Kyrylo Tkachov ; Richard Earnshaw
> ; nd 
> Subject: [PATCH 4/x] arm: Add vst1_bf16 + vst1q_bf16 intrinsics
> 
> Hi all,
> 
> Fourth patch of the series here adding vst1_bf16, vst1q_bf16 bfloat16
> related neon intrinsics.
> 
> Please see refer to:
> ACLE 
> ISA  
> 
> Regtested and bootstrapped.
> 
> Thanks!
> 
>   Andrea
> 
> gcc/ChangeLog
> 
> 2020-10-29  Andrea Corallo  
> 
>   * config/arm/arm_neon.h (vst1_bf16, vst1q_bf16): Add intrinsics.
>   * config/arm/arm_neon_builtins.def : Touch for:
>   __builtin_neon_vst1v4bf, __builtin_neon_vst1v8bf.
> 

I see this patch also has the hunk:
diff --git a/gcc/config/arm/arm-builtins.c b/gcc/config/arm/arm-builtins.c
index 33e8015b140..6dc5df93216 100644
--- a/gcc/config/arm/arm-builtins.c
+++ b/gcc/config/arm/arm-builtins.c
@@ -946,6 +946,9 @@ typedef struct {
 #define VAR13(T, N, A, B, C, D, E, F, G, H, I, J, K, L, M) \
   VAR12 (T, N, A, B, C, D, E, F, G, H, I, J, K, L) \
   VAR1 (T, N, M)
+#define VAR14(T, N, A, B, C, D, E, F, G, H, I, J, K, L, M, O) \
+  VAR13 (T, N, A, B, C, D, E, F, G, H, I, J, K, L, M) \
+  VAR1 (T, N, O)
 
 /* The builtin data can be found in arm_neon_builtins.def, arm_vfp_builtins.def
and arm_acle_builtins.def.  The entries in arm_neon_builtins.def require

That is a duplicate from a previous patch in the series and doesn't appear in 
the ChangeLog here.
I think it's in here by mistake?
Otherwise ok.
Thanks,
Kyrill

> gcc/testsuite/ChangeLog
> 
> 2020-10-29  Andrea Corallo  
> 
>   * gcc.target/arm/simd/vst1_bf16_1.c: New test.



Avoid recursion in tree-inline.c

2020-11-03 Thread Jan Hubicka
Hi,
this patch avoids recursion in tree-inline.c by producing necessary
clones early.  This avoids an ICE on overuse of AUX pointer there.

Bootstrapped/regtested x86_64-linux, committed.
Honza

gcc/ChangeLog:

2020-11-03  Jan Hubicka  

PR ipa/97578
* ipa-inline-transform.c (maybe_materialize_called_clones): New
function.
(inline_transform): Use it.

gcc/testsuite/ChangeLog:

2020-11-03  Jan Hubicka  

* gcc.c-torture/compile/pr97578.c: New test.

diff --git a/gcc/ipa-inline-transform.c b/gcc/ipa-inline-transform.c
index 4df1b7fb9ee..907a95cac5a 100644
--- a/gcc/ipa-inline-transform.c
+++ b/gcc/ipa-inline-transform.c
@@ -51,6 +51,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "ipa-modref-tree.h"
 #include "ipa-modref.h"
 #include "symtab-thunks.h"
+#include "symtab-clones.h"
 
 int ncalls_inlined;
 int nfunctions_inlined;
@@ -695,6 +696,31 @@ preserve_function_body_p (struct cgraph_node *node)
   return false;
 }
 
+/* tree-inline can not recurse; materialize all function bodie we will need
+   during inlining.  This includes inlined functions, but also called functions
+   with param manipulation because IPA param manipulation attaches debug
+   statements to PARM_DECLs of called clone.  Materialize them if needed.
+
+   FIXME: This is somehwat broken by design because it does not play well
+   with partitioning.  */
+
+static void
+maybe_materialize_called_clones (cgraph_node *node)
+{
+  for (cgraph_edge *e = node->callees; e; e = e->next_callee)
+{
+  clone_info *info;
+
+  if (!e->inline_failed)
+   maybe_materialize_called_clones (e->callee);
+
+  cgraph_node *callee = cgraph_node::get (e->callee->decl);
+  if (callee->clone_of
+ && (info = clone_info::get (callee)) && info->param_adjustments)
+   callee->get_untransformed_body ();
+}
+}
+
 /* Apply inline plan to function.  */
 
 unsigned int
@@ -748,6 +774,7 @@ inline_transform (struct cgraph_node *node)
   ENTRY_BLOCK_PTR_FOR_FN (cfun)->count = node->count;
 }
 
+  maybe_materialize_called_clones (node);
   for (e = node->callees; e; e = next)
 {
   if (!e->inline_failed)
diff --git a/gcc/testsuite/gcc.c-torture/compile/pr97578.c 
b/gcc/testsuite/gcc.c-torture/compile/pr97578.c
new file mode 100644
index 000..e007724fdae
--- /dev/null
+++ b/gcc/testsuite/gcc.c-torture/compile/pr97578.c
@@ -0,0 +1,11 @@
+int printf (const char *, ...);
+
+int a;
+static void b(int c) {
+  if (c)
+printf("%d", a);
+}
+void e() {
+  int d = 0;
+  b(d);
+}


Re: [PATCH 4/x] arm: Add vst1_bf16 + vst1q_bf16 intrinsics

2020-11-03 Thread Andrea Corallo via Gcc-patches
Kyrylo Tkachov  writes:
[...]
> I see this patch also has the hunk:
> diff --git a/gcc/config/arm/arm-builtins.c b/gcc/config/arm/arm-builtins.c
> index 33e8015b140..6dc5df93216 100644
> --- a/gcc/config/arm/arm-builtins.c
> +++ b/gcc/config/arm/arm-builtins.c
> @@ -946,6 +946,9 @@ typedef struct {
>  #define VAR13(T, N, A, B, C, D, E, F, G, H, I, J, K, L, M) \
>VAR12 (T, N, A, B, C, D, E, F, G, H, I, J, K, L) \
>VAR1 (T, N, M)
> +#define VAR14(T, N, A, B, C, D, E, F, G, H, I, J, K, L, M, O) \
> +  VAR13 (T, N, A, B, C, D, E, F, G, H, I, J, K, L, M) \
> +  VAR1 (T, N, O)
>  
>  /* The builtin data can be found in arm_neon_builtins.def, 
> arm_vfp_builtins.def
> and arm_acle_builtins.def.  The entries in arm_neon_builtins.def require
>
> That is a duplicate from a previous patch in the series and doesn't appear in 
> the ChangeLog here.
> I think it's in here by mistake?
> Otherwise ok.
> Thanks,
> Kyrill

Hi Kyrill,

thanks for reviewing this.

Unless I'm wrong I see this hunk present in 3/x but not in 4/x.

  Andrea


RE: [PATCH 1/5] [PR target/96342] Change field "simdlen" into poly_uint64

2020-11-03 Thread yangyang (ET)
Hi,

I have revised the patch based on your suggestions. I use multiple_p instead of 
!multiple_p if the eq situation is OK to make it easier to understand.

> >> > if (n->simdclone->inbranch)
> >> >   this_badness += 2048;
> >> > int target_badness = targetm.simd_clone.usable (n); @@ -3988,19
> >> > +3988,19 @@ vectorizable_simd_clone_call (vec_info *vinfo,
> >> > +stmt_vec_info
> >> stmt_info,
> >> > arginfo[i].vectype = get_vectype_for_scalar_type (vinfo, arg_type,
> >> >
> slp_node);
> >> > if (arginfo[i].vectype == NULL
> >> > -   || (simd_clone_subparts (arginfo[i].vectype)
> >> > -   > bestn->simdclone->simdlen))
> >> > +   || (known_gt (simd_clone_subparts (arginfo[i].vectype),
> >> > + bestn->simdclone->simdlen)))
> >>
> >> Here too I think we want constant_multiple_p:
> >>
> >>   || !constant_multiple_p (bestn->simdclone->simdlen,
> >>simd_clone_subparts
> >> (arginfo[i].vectype))
> >>
> >
> > Use multiple_p here since the multiple is not needed.
>
> True, but in the case of vectorisation, we need to generate a constant number
> of copies at compile time.  If we don't enforce a constant multiple, we might
> end up trying to use an Advanced SIMD routine when vectorising for SVE.
>
> The reason we don't have a two-argument version of constant_multiple_p is
> that so far nothing has needed it (at least AFAIK).  There's no conceptual
> problem with adding it though.  I'm happy to do that if it would help.
>

Two-argument versions of constant_multiple_p are added in the v3-patch. Could 
you please check if the added versions are OK ?

Bootstrap and tested on both aarch64 and x86 Linux platform, no new regression 
witnessed.

Any suggestions?

Thanks,
Yang Yang


PR96342-part1-v3.patch
Description: PR96342-part1-v3.patch


[PATCH] bootstrap/97666 - fix array of bool allocation

2020-11-03 Thread Richard Biener
This fixes the bad assumption that sizeof (bool) == 1

Bootstrap / regtest running on x86_64-unknown-linux-gnu.

2020-11-03  Richard Biener  

PR bootstrap/97666
* tree-vect-slp.c (vect_build_slp_tree_2): Scale
allocation of skip_args by sizeof (bool).
---
 gcc/tree-vect-slp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c
index e97fbe897a7..08018a1d799 100644
--- a/gcc/tree-vect-slp.c
+++ b/gcc/tree-vect-slp.c
@@ -1428,7 +1428,7 @@ vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
 
   /* If the SLP node is a PHI (induction or reduction), terminate
  the recursion.  */
-  bool *skip_args = XALLOCAVEC (bool, nops);
+  bool *skip_args = XALLOCAVEC (bool, sizeof (bool) * nops);
   memset (skip_args, 0, nops);
   if (loop_vec_info loop_vinfo = dyn_cast  (vinfo))
 if (gphi *stmt = dyn_cast  (stmt_info->stmt))
-- 
2.26.2


RE: [PATCH 4/x] arm: Add vst1_bf16 + vst1q_bf16 intrinsics

2020-11-03 Thread Kyrylo Tkachov via Gcc-patches



> -Original Message-
> From: Andrea Corallo 
> Sent: 03 November 2020 11:01
> To: Kyrylo Tkachov 
> Cc: gcc-patches@gcc.gnu.org; Richard Earnshaw
> ; nd 
> Subject: Re: [PATCH 4/x] arm: Add vst1_bf16 + vst1q_bf16 intrinsics
> 
> Kyrylo Tkachov  writes:
> [...]
> > I see this patch also has the hunk:
> > diff --git a/gcc/config/arm/arm-builtins.c b/gcc/config/arm/arm-builtins.c
> > index 33e8015b140..6dc5df93216 100644
> > --- a/gcc/config/arm/arm-builtins.c
> > +++ b/gcc/config/arm/arm-builtins.c
> > @@ -946,6 +946,9 @@ typedef struct {
> >  #define VAR13(T, N, A, B, C, D, E, F, G, H, I, J, K, L, M) \
> >VAR12 (T, N, A, B, C, D, E, F, G, H, I, J, K, L) \
> >VAR1 (T, N, M)
> > +#define VAR14(T, N, A, B, C, D, E, F, G, H, I, J, K, L, M, O) \
> > +  VAR13 (T, N, A, B, C, D, E, F, G, H, I, J, K, L, M) \
> > +  VAR1 (T, N, O)
> >
> >  /* The builtin data can be found in arm_neon_builtins.def,
> arm_vfp_builtins.def
> > and arm_acle_builtins.def.  The entries in arm_neon_builtins.def require
> >
> > That is a duplicate from a previous patch in the series and doesn't appear
> in the ChangeLog here.
> > I think it's in here by mistake?
> > Otherwise ok.
> > Thanks,
> > Kyrill
> 
> Hi Kyrill,
> 
> thanks for reviewing this.
> 
> Unless I'm wrong I see this hunk present in 3/x but not in 4/x.

Ah, you're right, I confused my views.
I need a better editor...
This is ok,
Kyrill

> 
>   Andrea


RE: [PATCH 3/x] arm: Add vld1_bf16 + vld1q_bf16 intrinsics

2020-11-03 Thread Kyrylo Tkachov via Gcc-patches



> -Original Message-
> From: Andrea Corallo 
> Sent: 02 November 2020 09:01
> To: gcc-patches@gcc.gnu.org
> Cc: Kyrylo Tkachov ; Richard Earnshaw
> ; nd 
> Subject: [PATCH 3/x] arm: Add vld1_bf16 + vld1q_bf16 intrinsics
> 
> Hi all,
> 
> Third patch of the serie here adding vld1_bf16, vld1q_bf16 bfloat16
> related neon intrinsics.
> 
> Please see refer to:
> ACLE 
> ISA  
> 
> Regtested and bootstrapped.
> 
> Thanks!
> 

Ok.
Thanks,
Kyrill

>   Andrea
> 
> gcc/ChangeLog
> 
> 2020-10-29  Andrea Corallo  
> 
>   * config/arm/arm-builtins.c (VAR14): Define macro.
>   * config/arm/arm_neon_builtins.def: Touch for:
>   __builtin_neon_vld1v4bf, __builtin_neon_vld1v8bf.
>   * config/arm/arm_neon.h (vld1_bf16, vld1q_bf16): Add intrinsics.
> 
> gcc/testsuite/ChangeLog
> 
> 2020-10-29  Andrea Corallo  
> 
>   * gcc.target/arm/simd/vld1_bf16_1.c: New test.
> 


Re: [PATCH][AArch64] ACLE intrinsics: get low/high half from BFloat16 vector

2020-11-03 Thread Dennis Zhang via Gcc-patches

Hi Richard,

On 10/30/20 2:07 PM, Richard Sandiford wrote:

Dennis Zhang  writes:

diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def 
b/gcc/config/aarch64/aarch64-simd-builtins.def
index 332a0b6b1ea..39ebb776d1d 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -719,6 +719,9 @@
VAR1 (QUADOP_LANE, bfmlalb_lane_q, 0, ALL, v4sf)
VAR1 (QUADOP_LANE, bfmlalt_lane_q, 0, ALL, v4sf)
  
+  /* Implemented by aarch64_vget_halfv8bf.  */

+  VAR1 (GETREG, vget_half, 0, ALL, v8bf)


This should be AUTO_FP, since it doesn't have any side-effects.
(As before, we should probably rename the flag, but that's separate work.)


+
/* Implemented by aarch64_simd_mmlav16qi.  */
VAR1 (TERNOP, simd_smmla, 0, NONE, v16qi)
VAR1 (TERNOPU, simd_ummla, 0, NONE, v16qi)
diff --git a/gcc/config/aarch64/aarch64-simd.md 
b/gcc/config/aarch64/aarch64-simd.md
index 9f0e2bd1e6f..f62c52ca327 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -7159,6 +7159,19 @@
[(set_attr "type" "neon_dot")]
  )
  
+;; vget_low/high_bf16

+(define_expand "aarch64_vget_halfv8bf"
+  [(match_operand:V4BF 0 "register_operand")
+   (match_operand:V8BF 1 "register_operand")
+   (match_operand:SI 2 "aarch64_zero_or_1")]
+  "TARGET_BF16_SIMD"
+{
+  int hbase = INTVAL (operands[2]);
+  rtx sel = aarch64_gen_stepped_int_parallel (4, hbase * 4, 1);


I think this needs to be:

   aarch64_simd_vect_par_cnst_half

instead.  The issue is that on big-endian targets, GCC assumes vector
lane 0 is in the high part of the register, whereas for AArch64 it's
always in the low part of the register.  So we convert from AArch64
numbering to GCC numbering when generating the rtx and then take
endianness into account when matching the rtx later.

It would be good to have -mbig-endian tests that make sure we generate
the right instruction for each function (i.e. we get them the right way
round).  I guess it would be good to test that for little-endian too.



I've updated the expander using aarch64_simd_vect_par_cnst_half.
And the expander is divided into two for getting low and high half 
separately.
It's tested for aarch64-none-linux-gnu and aarch64_be-none-linux-gnu 
targets with new tests including -mbig-endian option.



+  emit_insn (gen_aarch64_get_halfv8bf (operands[0], operands[1], sel));
+  DONE;
+})
+
  ;; bfmmla
  (define_insn "aarch64_bfmmlaqv4sf"
[(set (match_operand:V4SF 0 "register_operand" "=w")
diff --git a/gcc/config/aarch64/predicates.md b/gcc/config/aarch64/predicates.md
index 215fcec5955..0c8bc2b0c73 100644
--- a/gcc/config/aarch64/predicates.md
+++ b/gcc/config/aarch64/predicates.md
@@ -84,6 +84,10 @@
 (ior (match_test "op == constm1_rtx")
  (match_test "op == const1_rtx"))
  
+(define_predicate "aarch64_zero_or_1"

+  (and (match_code "const_int")
+   (match_test "op == const0_rtx || op == const1_rtx")))


zero_or_1 looked odd to me, feels like it should be 0_or_1 or zero_or_one.
But I see that it's for consistency with aarch64_reg_zero_or_m1_or_1,
so let's keep it as-is.



This predicate is removed since there is no need of the imm operand in 
the new expanders.


Thanks for the reviews.
Is it OK for trunk now?

Cheers
Dennis


diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index eb8e6f7b3d8..f26a96042bc 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -722,6 +722,10 @@
   VAR1 (QUADOP_LANE, bfmlalb_lane_q, 0, ALL, v4sf)
   VAR1 (QUADOP_LANE, bfmlalt_lane_q, 0, ALL, v4sf)
 
+  /* Implemented by aarch64_vget_lo/hi_halfv8bf.  */
+  VAR1 (UNOP, vget_lo_half, 0, AUTO_FP, v8bf)
+  VAR1 (UNOP, vget_hi_half, 0, AUTO_FP, v8bf)
+
   /* Implemented by aarch64_simd_mmlav16qi.  */
   VAR1 (TERNOP, simd_smmla, 0, NONE, v16qi)
   VAR1 (TERNOPU, simd_ummla, 0, NONE, v16qi)
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 381a702eba0..af29a2f26f5 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -7159,6 +7159,27 @@
   [(set_attr "type" "neon_dot")]
 )
 
+;; vget_low/high_bf16
+(define_expand "aarch64_vget_lo_halfv8bf"
+  [(match_operand:V4BF 0 "register_operand")
+   (match_operand:V8BF 1 "register_operand")]
+  "TARGET_BF16_SIMD"
+{
+  rtx p = aarch64_simd_vect_par_cnst_half (V8BFmode, 8, false);
+  emit_insn (gen_aarch64_get_halfv8bf (operands[0], operands[1], p));
+  DONE;
+})
+
+(define_expand "aarch64_vget_hi_halfv8bf"
+  [(match_operand:V4BF 0 "register_operand")
+   (match_operand:V8BF 1 "register_operand")]
+  "TARGET_BF16_SIMD"
+{
+  rtx p = aarch64_simd_vect_par_cnst_half (V8BFmode, 8, true);
+  emit_insn (gen_aarch64_get_halfv8bf (operands[0], operands[1], p));
+  DONE;
+})
+
 ;; bfmmla
 (define_insn "aarch64_bfmmlaqv4sf"
   [(set (match_operand:V4SF 0 "register_operand" "=w")
diff --git

Re: [patch] lto test: Add dg-require-effective-target lto on test using -flto

2020-11-03 Thread Olivier Hainque



> On 3 Nov 2020, at 11:03, Richard Biener  wrote:

> Sure - consider such fixes obvious in the future.

Understood. Thanks for your prompt feedback Richard :)

Cheers,

Olivier

>>* gcc.dg/tree-ssa/pr71077.c: Add
>>dg-require-effective-target lto.




RE: [PATCH 5/x] arm: Add vldN_lane_bf16 + vldNq_lane_bf16 intrinsics

2020-11-03 Thread Kyrylo Tkachov via Gcc-patches
Hi Andrea,

> -Original Message-
> From: Andrea Corallo 
> Sent: 02 November 2020 09:03
> To: gcc-patches@gcc.gnu.org
> Cc: Kyrylo Tkachov ; Richard Earnshaw
> ; nd 
> Subject: [PATCH 5/x] arm: Add vldN_lane_bf16 + vldNq_lane_bf16 intrisics
> 
> Hi all,
> 
> 5th patch of the serie here adding vld2_lane_bf16, vld2q_lane_bf16,
> vld3_lane_bf16, vld3q_lane_bf16, vld4_lane_bf16, vld4q_lane_bf16
> related neon intrinsics.
> 
> Please see refer to:
> ACLE 
> ISA  
> 
> Regtested and bootstrapped.
> 
> Thanks!
> 
>   Andrea
> 
> gcc/ChangeLog
> 
> 2020-10-29  Andrea Corallo  
> 
>   * config/arm/arm_neon.h (vld2_lane_bf16, vld2q_lane_bf16)
>   (vld3_lane_bf16, vld3q_lane_bf16, vld4_lane_bf16)
>   (vld4q_lane_bf16): Add intrinsics.
>   * config/arm/arm_neon_builtins.def: Touch for:
>   __builtin_neon_vld2_lanev4bf, __builtin_neon_vld2_lanev8bf,
>   __builtin_neon_vld3_lanev4bf, __builtin_neon_vld3_lanev8bf,
>   __builtin_neon_vld4_lanev4bf, __builtin_neon_vld4_lanev8bf.
>   * config/arm/iterators.md (VQ_HS): Add V8BF to the iterator.
> 

I think this part:
diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md
index c70e3bc2731..8c0884518df 100644
--- a/gcc/config/arm/iterators.md
+++ b/gcc/config/arm/iterators.md
@@ -116,7 +116,7 @@
 (define_mode_iterator VQ2BF [V16QI V8HI V8HF (V8BF "TARGET_BF16_SIMD") V4SI 
V4SF])
 
 ;; Quad-width vector modes with 16- or 32-bit elements
-(define_mode_iterator VQ_HS [V8HI V8HF V4SI V4SF])
+(define_mode_iterator VQ_HS [V8HI V8HF V4SI V4SF V8BF])

The V8BF needs to be guarded like so (V8BF "TARGET_BF16_SIMD") to make sure 
it's not enabled when bfloat16 is not available.

Ok with that change.
Thanks,
Kyrill


> gcc/testsuite/ChangeLog
> 
> 2020-10-29  Andrea Corallo  
> 
>   * gcc.target/aarch64/advsimd-intrinsics/vld2_lane_bf16_indices_1.c:
>   Run it also for the arm backend.
>   * gcc.target/aarch64/advsimd-intrinsics/vld2q_lane_bf16_indices_1.c:
>   Likewise.
>   * gcc.target/aarch64/advsimd-intrinsics/vld3_lane_bf16_indices_1.c:
>   Likewise.
>   * gcc.target/aarch64/advsimd-intrinsics/vld3q_lane_bf16_indices_1.c:
>   Likewise.
>   * gcc.target/aarch64/advsimd-intrinsics/vld4q_lane_bf16_indices_1.c:
>   Likewise.
>   * gcc.target/arm/simd/vldn_lane_bf16_1.c: New test.



RE: [PATCH 6/x] arm: Add vstN_lane_bf16 + vstNq_lane_bf16 intrinsics

2020-11-03 Thread Kyrylo Tkachov via Gcc-patches



> -Original Message-
> From: Andrea Corallo 
> Sent: 02 November 2020 09:04
> To: gcc-patches@gcc.gnu.org
> Cc: Kyrylo Tkachov ; Richard Earnshaw
> ; nd 
> Subject: [PATCH 6/x] arm: Add vstN_lane_bf16 + vstNq_lane_bf16 intrisics
> 
> Hi all,
> 
> last patch for this the serie adding vst2_lane_bf16, vst2q_lane_bf16,
> vst3_lane_bf16, vst3q_lane_bf16, vst4_lane_bf16, vst4q_lane_bf16
> related neon intrinsics.
> 
> Please see refer to:
> ACLE 
> ISA  
> 
> Regtested and bootstrapped.
> 

Ok.
Thanks,
Kyrill

> Thanks!
> 
>   Andrea



Re: [PATCH] aarch64: Add backend support for expanding __builtin_memset

2020-11-03 Thread Richard Sandiford via Gcc-patches
Sudakshina Das  writes:
>> -Original Message-
>> From: Richard Sandiford 
>> Sent: 30 October 2020 19:56
>> To: Sudakshina Das 
>> Cc: Wilco Dijkstra ; gcc-patches@gcc.gnu.org;
>> Kyrylo Tkachov ; Richard Earnshaw
>> 
>> Subject: Re: [PATCH] aarch64: Add backend support for expanding
>> __builtin_memset
>> 
>> > +  base = copy_to_mode_reg (Pmode, XEXP (dst, 0));  dst =
>> > + adjust_automodify_address (dst, VOIDmode, base, 0);
>> > +
>> > +  /* Prepare the val using a DUP v0.16B, val.  */  if (CONST_INT_P
>> > + (val))
>> > +{
>> > +  val = force_reg (QImode, val);
>> > +}
>> > +  src = gen_reg_rtx (V16QImode);
>> > +  emit_insn (gen_aarch64_simd_dupv16qi(src, val));
>> 
>> I think we should use:
>> 
>>   src = expand_vector_broadcast (V16QImode, val);
>> 
>> here (without the CONST_INT_P check), so that for constants we just move a
>> constant directly into a register.
>>
>
> Sorry to bring this up again. When I tried expand_vector_broadcast, I 
> see the following behaviour:
> for __builtin_memset(p, 1, 24) where the duplicated constant fits
> moviv0.16b, 0x1
> mov x1, 72340172838076673
> str x1, [x0, 16]
> str q0, [x0]
> and an ICE for __builtin_memset(p, 1, 32) where I am guessing the duplicated
> constant does not fit
> x.c:7:30: error: unrecognizable insn:
> 7 | { __builtin_memset(p, 1, 32);}
>   |  ^
> (insn 8 7 0 2 (parallel [
> (set (mem:V16QI (reg:DI 94) [0 MEM  [(void 
> *)p_2(D)]+0 S16 A8])
> (const_vector:V16QI [
> (const_int 1 [0x1]) repeated x16
> ]))
> (set (mem:V16QI (plus:DI (reg:DI 94)
> (const_int 16 [0x10])) [0 MEM  [(void 
> *)p_2(D)]+16 S16 A8])
> (const_vector:V16QI [
> (const_int 1 [0x1]) repeated x16
> ]))
> ]) "x.c":7:3 -1
>  (nil))
> during RTL pass: vregs

Ah, yeah, I guess we need to call force_reg on the result.

>> So yeah, I'm certainly not questioning the speed_p value of 256.
>> I'm sure you and Wilco have picked the best value for that.  But -Os stuff 
>> can
>> usually be justified on first principles and I wasn't sure where the value 
>> of 128
>> came from.
>>
>
> I had another chat with Wilco about the 128byte value for !speed_p. We
> estimate the average number of instructions upto 128byte would be ~3 which
> is similar to do a memset call. But I did go back and think about the tuning
> argument of  AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS a bit more because
> you are right that based on that the average instructions can become double.
> I would propose using 256/128 based on speed_p but halving the value based on 
> the
> tune parameter. Obviously the assumption here is that we are respecting the 
> core's
> choice of avoiding stp of q registers (given that I do not see other uses of
> AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS being changed by -Os).

Yeah, but I think the lack of an -Os check in the existing code might be
a mistake.  The point is that STP Q is smaller than two separate STR Qs,
so using it is a size optimisation even if it's not a speed optimisation.
And like I say, -Os isn't supposed to be striking a balance between
size and speed: it's supposed to be going for size quite aggressively.

So TBH I have slight preference for keeping the current value and only
checking the tuning flag for speed_p.  But I agree that halving the
value would be self-consistent, so if you or Wilco believe strongly
that halving is better, that'd be OK with me too.

> There might be a debate on how useful AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS
> is in the context of memset/memcpy but that needs more analysis and I would
> say should be a separate patch.

Agreed.

>> >> > +  if (n > 0 && n < copy_limit / 2)
>> >> > +   {
>> >> > + next_mode = smallest_mode_for_size (n, MODE_INT);
>> >> > + /* Last 1-byte causes the compiler to optimize to STRB when it
>> >> should
>> >> > +use STR Bx, [mem] since we already used SIMD registers.
>> >> > +So force it to HImode.  */
>> >> > + if (next_mode == QImode)
>> >> > +   next_mode = HImode;
>> >>
>> >> Is this always better?  E.g. for variable inputs and zero it seems
>> >> quite natural to store the original scalar GPR.
>> >>
>> >> If we do do this, I think we should assert before the loop that n > 1.
>> >>
>> >> Also, it would be good to cover this case in the tests.
>> >
>> > To give a background on this:
>> > So the case in point here is when we are copying the _last_ 1 byte. So
>> > the following Void foo (void *p) { __builtin_memset (p, 1, 3); } The
>> > compiler was generating
>> > moviv0.16b, 0x1
>> > mov w1, 1
>> > strbw1, [x0, 2]
>> > str h0, [x0]
>> > ret
>> > This is because after my expansion in subsequent passes it would see
>> > (insn 13 12

[PATCH] fortran/97652 - workaround missing canonicalization of PDT types

2020-11-03 Thread Richard Biener
This marks PDT types as needing structural comparison for TBAA
if we didn't pick up a canonical variant (which we should IMHO
always do).  This workaround fixes the gfortran.dg/pdt_14.f03
fail which materializes as testsuite timeout which is quite
annoying.

Bootstrap / regtest pending on x86_64-unknown-linux-gnu.

OK?

2020-11-03  Richard Biener  

PR fortran/97652
gcc/fortran
* trans-types.c (gfc_get_derived_type): When we didn't find
a canonical type mark it for structural equality.
---
 gcc/fortran/trans-types.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/gcc/fortran/trans-types.c b/gcc/fortran/trans-types.c
index b7129dcbe6d..4643fff243f 100644
--- a/gcc/fortran/trans-types.c
+++ b/gcc/fortran/trans-types.c
@@ -2647,6 +2647,8 @@ gfc_get_derived_type (gfc_symbol * derived, int codimen)
   typenode = make_node (RECORD_TYPE);
   TYPE_NAME (typenode) = get_identifier (derived->name);
   TYPE_PACKED (typenode) = flag_pack_derived;
+  if (!got_canonical)
+   SET_TYPE_STRUCTURAL_EQUALITY (typenode);
   derived->backend_decl = typenode;
 }
 
-- 
2.26.2


Re: PING [Patch] x86: Enable GCC support for Intel AVX-VNNI extension

2020-11-03 Thread Hongtao Liu via Gcc-patches
ping^2, i hope this patch could land on GCC11.

On Wed, Oct 28, 2020 at 5:23 PM Hongyu Wang  wrote:
>
> Hongyu Wang  于2020年10月14日周三 上午11:27写道:
> >
> > Hi:
> >
> > This patch is about to support Intel AVX-VNNI instructions.
> >
> > AVX-VNNI is an equivalent to AVX512-VNNI with VEX encoding. The 
> > instructions are same, but with extra {vex} prefix to distinguish from 
> > AVX512-VNNI instructions in assembler.
> >
> > For more details, please refer to 
> > https://software.intel.com/content/dam/develop/external/us/en/documents/architecture-instruction-set-extensions-programming-reference.pdf
> >
> > Bootstrap ok, regression test on i386/x86 backend is ok.
> >
> > OK for master?
> >
> > 2020-10-13  Hongtao Liu  
> > Hongyu Wang  
> >
> > gcc/
> > * common/config/i386/cpuinfo.h (get_available_features):
> > Detect AVXVNNI.
> > * common/config/i386/i386-common.c
> > (OPTION_MASK_ISA2_AVXVNNI_SET,
> > OPTION_MASK_ISA2_AVXVNNI_UNSET, OPTION_MASK_ISA2_AVX2_UNSET):
> > New.
> > (ix86_hanlde_option): Handle -mavxvnni, unset avxvnni when
> > avx2 is disabled.
> > * common/config/i386/i386-cpuinfo.h (enum processor_features):
> > Add FEATURE_AVXVNNI.
> > * common/config/i386/i386-isas.h: Add ISA_NAMES_TABLE_ENTRY
> > for avxvnni.
> > * config.gcc: Add avxvnniintrin.h.
> > * config/i386/avx512vnniintrin.h: Remove 128/256 bit non-mask
> > intrinsics.
> > * config/i386/avxvnniintrin.h: New header file.
> > * config/i386/cpuid.h (bit_AVXVNNI): New.
> > * config/i386/i386-builtins.c (def_builtin): Handle AVXVNNI mask
> > for unified builtin.
> > * config/i386/i386-builtin.def (BDESC): Adjust AVX512VNNI
> > builtins for AVXVNNI.
> > * config/i386/i386-c.c (ix86_target_macros_internal): Define
> > __AVXVNNI__.
> > * config/i386/i386-expand.c (ix86_expand_builtin): Handle bisa
> > for AVXVNNI to support unified intrinsic name, since there is no
> > dependency between AVX512VNNI and AVXVNNI.
> > * config/i386/i386-options.c (isa2_opts): Add -mavxvnni.
> > (ix86_valid_target_attribute_inner_p): Handle avxnnni.
> > (ix86_valid_target_attribute_inner_p): Ditto.
> > * config/i386/i386.h (TARGET_AVXVNNI, TARGET_AVXVNNI_P,
> > TARGET_AVXVNNI_P, PTA_AVXVNNI): New.
> > (PTA_SAPPHIRERAPIDS): Add AVX_VNNI.
> > (PTA_ALDERLAKE): Likewise.
> > * config/i386/i386.md ("isa"): Add avxvnni, avx512vnnivl.
> > ("enabled"): Adjust for avxvnni and avx512vnnivl.
> > * config/i386/i386.opt: Add option -mavxvnni.
> > * config/i386/immintrin.h: Include avxvnniintrin.h.
> > * config/i386/sse.md (vpdpbusd_): Adjust for AVXVNNI.
> > (vpdpbusds_): Likewise.
> > (vpdpwssd_): Likewise.
> > (vpdpwssds_): Likewise.
> > (vpdpbusd_v16si): New.
> > (vpdpbusds_v16si): Likewise.
> > (vpdpwssd_v16si): Likewise.
> > (vpdpwssds_v16si): Likewise.
> > * doc/invoke.texi: Document -mavxvnni.
> > * doc/extend.texi: Document avxvnni.
> > * doc/sourcebuild.texi: Document target avxvnni.
> >
> > gcc/testsuite/
> >
> > * gcc.target/i386/avx512vl-vnni-1.c: Rename..
> > * gcc.target/i386/avx512vl-vnni-1a.c: To This.
> > * gcc.target/i386/avx512vl-vnni-1b.c: New test.
> > * gcc.target/i386/avx512vl-vnni-2.c: Ditto.
> > * gcc.target/i386/avx512vl-vnni-3.c: Ditto.
> > * gcc.target/i386/avx-vnni-1.c: Ditto.
> > * gcc.target/i386/avx-vnni-2.c: Ditto.
> > * gcc.target/i386/avx-vnni-3.c: Ditto.
> > * gcc.target/i386/avx-vnni-4.c: Ditto.
> > * gcc.target/i386/avx-vnni-5.c: Ditto.
> > * gcc.target/i386/avx-vnni-6.c: Ditto.
> > * gcc.target/i386/avx-vpdpbusd-2.c: Ditto.
> > * gcc.target/i386/avx-vpdpbusds-2.c: Ditto.
> > * gcc.target/i386/avx-vpdpwssd-2.c: Ditto.
> > * gcc.target/i386/avx-vpdpwssds-2.c: Ditto.
> > * gcc.target/i386/vnni_inline_error.c: Ditto.
> > * gcc.target/i386/avx512vnnivl-builtin.c: Ditto.
> > * gcc.target/i386/avxvnni-builtin.c: Ditto.
> > * gcc.target/i386/funcspec-56.inc: Add new target attribute.
> > * gcc.target/i386/pr83488-3.c: Adjust.
> > * gcc.target/i386/sse-12.c: Add -mavxvnni.
> > * gcc.target/i386/sse-13.c: Ditto.
> > * gcc.target/i386/sse-14.c: Ditto.
> > * gcc.target/i386/sse-22.c: Ditto.
> > * gcc.target/i386/sse-23.c: Ditto.
> > * g++.dg/other/i386-2.C: Ditto.
> > * g++.dg/other/i386-3.C: Ditto.
> > * lib/target-supports.exp (check_effective_target_avxvnni):
> > New proc.
> >
> > --
> > Regards,
> >
> > Hongyu, Wang
>
> Rebased on 2020-10-27 trunk and PING.



-- 
BR,
Hongtao


Re: [patch] i386 tests: Add dg-require-effective-target fpic to gcc.target/i386 tests

2020-11-03 Thread Olivier Hainque


> On 3 Nov 2020, at 11:25, Jakub Jelinek  wrote:
> 
> Ok, thanks.

Thanks Jakub!

I have a couple of similar ones (for fpic) for non-target-specific
tests in preparation.

Then JIC, two smaller ones in the same spirit already sent:

https://gcc.gnu.org/pipermail/gcc-patches/2020-October/557477.html
(dg-require-profiling on i386 tests using -pg)

and

https://gcc.gnu.org/pipermail/gcc-patches/2020-November/557738.html
(dg-require-effective-target fopenmp on g++ test)




RE: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions emitted at -O3

2020-11-03 Thread xiezhiheng
> -Original Message-
> From: Richard Sandiford [mailto:richard.sandif...@arm.com]
> Sent: Friday, October 30, 2020 6:24 PM
> To: xiezhiheng 
> Cc: gcc-patches@gcc.gnu.org
> Subject: Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions
> emitted at -O3
> 
> xiezhiheng  writes:
> >> -Original Message-
> >> From: Richard Sandiford [mailto:richard.sandif...@arm.com]
> >> Sent: Monday, October 26, 2020 9:03 PM
> >> To: xiezhiheng 
> >> Cc: Richard Biener ;
> gcc-patches@gcc.gnu.org
> >> Subject: Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions
> >> emitted at -O3
> >>
> >> Thanks, pushed to trunk.
> >>
> >
> > Thanks, and I made the patch for float conversion intrinsics.
> 
> LGTM, thanks.  Pushed.
> 

Thanks.  And I made two separate patches for these two groups, compare 
intrinsics
and encryption algorithm (AES/SHA/SM3/SM4) intrinsics.

Note: It does not matter which patch is applied first.

Bootstrapped and tested on aarch64 Linux platform.

Thanks,
Xie Zhiheng



diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 9f743ecc89a..ba5e3dc7c55 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,9 @@
+2020-11-03  Zhiheng Xie  
+   Nannan Zheng  
+
+   * config/aarch64/aarch64-simd-builtins.def: Add proper FLAG
+   for compare intrinsics.
+

diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 9f743ecc89a..d6b943fc0df 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,9 @@
+2020-11-03  Zhiheng Xie  
+   Nannan Zheng  
+
+   * config/aarch64/aarch64-simd-builtins.def: Add proper FLAG
+   for AES/SHA/SM3/SM4 intrinsics.
+


compare-v1.patch
Description: compare-v1.patch


encryption-v1.patch
Description: encryption-v1.patch


Re: [PATCH] fortran/97652 - workaround missing canonicalization of PDT types

2020-11-03 Thread Paul Richard Thomas via Gcc-patches
Hi Richi,

That's OK for master and as far back as you have the fortitude to go.

Thanks for the fix.

I agree that we should always pick up a canonical variant. When I finally
get "a minute or two" to fix the rather fundamental problems with PDTs, I
will make sure that this goes away. I am also curious about the time out -
I'll take a look.

Paul


On Tue, 3 Nov 2020 at 11:35, Richard Biener  wrote:

> This marks PDT types as needing structural comparison for TBAA
> if we didn't pick up a canonical variant (which we should IMHO
> always do).  This workaround fixes the gfortran.dg/pdt_14.f03
> fail which materializes as testsuite timeout which is quite
> annoying.
>
> Bootstrap / regtest pending on x86_64-unknown-linux-gnu.
>
> OK?
>
> 2020-11-03  Richard Biener  
>
> PR fortran/97652
> gcc/fortran
> * trans-types.c (gfc_get_derived_type): When we didn't find
> a canonical type mark it for structural equality.
> ---
>  gcc/fortran/trans-types.c | 2 ++
>  1 file changed, 2 insertions(+)
>
> diff --git a/gcc/fortran/trans-types.c b/gcc/fortran/trans-types.c
> index b7129dcbe6d..4643fff243f 100644
> --- a/gcc/fortran/trans-types.c
> +++ b/gcc/fortran/trans-types.c
> @@ -2647,6 +2647,8 @@ gfc_get_derived_type (gfc_symbol * derived, int
> codimen)
>typenode = make_node (RECORD_TYPE);
>TYPE_NAME (typenode) = get_identifier (derived->name);
>TYPE_PACKED (typenode) = flag_pack_derived;
> +  if (!got_canonical)
> +   SET_TYPE_STRUCTURAL_EQUALITY (typenode);
>derived->backend_decl = typenode;
>  }
>
> --
> 2.26.2
>


-- 
"If you can't explain it simply, you don't understand it well enough" -
Albert Einstein


[PATCH] testsuite: Fix gcc.target/i386/zero-scratch-regs-*.c scan-asm directives

2020-11-03 Thread Uros Bizjak via Gcc-patches
Improve zero-scratch-regs-*.c scan-asm regexps
and add target selectors for 32bit targets.

2020-11-03  Uroš Bizjak  

gcc/testsuite/ChangeLog:

* gcc.target/i386/zero-scratch-regs-1.c: Add ia32 target
selector where appropriate.  Improve scan-assembler regexp.
* gcc.target/i386/zero-scratch-regs-2.c: Ditto.
* gcc.target/i386/zero-scratch-regs-3.c: Ditto.
* gcc.target/i386/zero-scratch-regs-4.c: Ditto.
* gcc.target/i386/zero-scratch-regs-5.c: Ditto.
* gcc.target/i386/zero-scratch-regs-6.c: Ditto.
* gcc.target/i386/zero-scratch-regs-7.c: Ditto.
* gcc.target/i386/zero-scratch-regs-8.c: Ditto.
* gcc.target/i386/zero-scratch-regs-9.c: Ditto.
* gcc.target/i386/zero-scratch-regs-10.c: Ditto.
* gcc.target/i386/zero-scratch-regs-13.c: Ditto.
* gcc.target/i386/zero-scratch-regs-14.c: Ditto.
* gcc.target/i386/zero-scratch-regs-15.c: Ditto.
* gcc.target/i386/zero-scratch-regs-16.c: Ditto.
* gcc.target/i386/zero-scratch-regs-17.c: Ditto.
* gcc.target/i386/zero-scratch-regs-18.c: Ditto.
* gcc.target/i386/zero-scratch-regs-19.c: Ditto.
* gcc.target/i386/zero-scratch-regs-20.c: Ditto.
* gcc.target/i386/zero-scratch-regs-21.c: Ditto.
* gcc.target/i386/zero-scratch-regs-22.c: Ditto.
* gcc.target/i386/zero-scratch-regs-23.c: Ditto.
* gcc.target/i386/zero-scratch-regs-24.c: Ditto.
* gcc.target/i386/zero-scratch-regs-25.c: Ditto.
* gcc.target/i386/zero-scratch-regs-26.c: Ditto.
* gcc.target/i386/zero-scratch-regs-27.c: Ditto.
* gcc.target/i386/zero-scratch-regs-28.c: Ditto.
* gcc.target/i386/zero-scratch-regs-29.c: Ditto.
* gcc.target/i386/zero-scratch-regs-30.c: Ditto.
* gcc.target/i386/zero-scratch-regs-31.c: Ditto.

Tested on x86_64-linux-gnu {,-m32} and pushed.

Uros.
diff --git a/gcc/testsuite/gcc.target/i386/zero-scratch-regs-1.c 
b/gcc/testsuite/gcc.target/i386/zero-scratch-regs-1.c
index 9f61dc4c863..1ea6de880aa 100644
--- a/gcc/testsuite/gcc.target/i386/zero-scratch-regs-1.c
+++ b/gcc/testsuite/gcc.target/i386/zero-scratch-regs-1.c
@@ -8,5 +8,5 @@ foo (void)
 
 /* { dg-final { scan-assembler-not "vzeroall" } } */
 /* { dg-final { scan-assembler-not "%xmm" } } */
-/* { dg-final { scan-assembler-not "xorl\[ \t\]*%" } } */
-/* { dg-final { scan-assembler-not "movl\[ \t\]*%" } } */
+/* { dg-final { scan-assembler-not "xorl\[ \t\]+%" } } */
+/* { dg-final { scan-assembler-not "movl\[ \t\]+%" } } */
diff --git a/gcc/testsuite/gcc.target/i386/zero-scratch-regs-10.c 
b/gcc/testsuite/gcc.target/i386/zero-scratch-regs-10.c
index 09048e57f94..389b1142264 100644
--- a/gcc/testsuite/gcc.target/i386/zero-scratch-regs-10.c
+++ b/gcc/testsuite/gcc.target/i386/zero-scratch-regs-10.c
@@ -11,11 +11,11 @@ foo (int x)
 
 /* { dg-final { scan-assembler-not "vzeroall" } } */
 /* { dg-final { scan-assembler-not "%xmm" } } */
-/* { dg-final { scan-assembler "xorl\[ \t\]*%edx, %edx" } } */
-/* { dg-final { scan-assembler "movl\[ \t\]*%edx, %ecx" } } */
-/* { dg-final { scan-assembler "movl\[ \t\]*%edx, %esi" { target { ! ia32 } } 
} } */
-/* { dg-final { scan-assembler "movl\[ \t\]*%edx, %edi" { target { ! ia32 } } 
} } */
-/* { dg-final { scan-assembler "movl\[ \t\]*%edx, %r8d" { target { ! ia32 } } 
} } */
-/* { dg-final { scan-assembler "movl\[ \t\]*%edx, %r9d" { target { ! ia32 } } 
} } */
-/* { dg-final { scan-assembler "movl\[ \t\]*%edx, %r10d" { target { ! ia32 } } 
} } */
-/* { dg-final { scan-assembler "movl\[ \t\]*%edx, %r11d" { target { ! ia32 } } 
} } */
+/* { dg-final { scan-assembler "xorl\[ \t\]+%edx, %edx" } } */
+/* { dg-final { scan-assembler "movl\[ \t\]+%edx, %ecx" } } */
+/* { dg-final { scan-assembler "movl\[ \t\]+%edx, %esi" { target { ! ia32 } } 
} } */
+/* { dg-final { scan-assembler "movl\[ \t\]+%edx, %edi" { target { ! ia32 } } 
} } */
+/* { dg-final { scan-assembler "movl\[ \t\]+%edx, %r8d" { target { ! ia32 } } 
} } */
+/* { dg-final { scan-assembler "movl\[ \t\]+%edx, %r9d" { target { ! ia32 } } 
} } */
+/* { dg-final { scan-assembler "movl\[ \t\]+%edx, %r10d" { target { ! ia32 } } 
} } */
+/* { dg-final { scan-assembler "movl\[ \t\]+%edx, %r11d" { target { ! ia32 } } 
} } */
diff --git a/gcc/testsuite/gcc.target/i386/zero-scratch-regs-13.c 
b/gcc/testsuite/gcc.target/i386/zero-scratch-regs-13.c
index 8b058e35540..07d8de7c66f 100644
--- a/gcc/testsuite/gcc.target/i386/zero-scratch-regs-13.c
+++ b/gcc/testsuite/gcc.target/i386/zero-scratch-regs-13.c
@@ -7,15 +7,15 @@ foo (void)
 }
 
 /* { dg-final { scan-assembler-not "vzeroall" } } */
-/* { dg-final { scan-assembler "pxor\[ \t\]*%xmm0, %xmm0" } } */
-/* { dg-final { scan-assembler-times "movaps\[ \t\]*%xmm0, %xmm\[0-9\]+" 7 { 
target { ia32 } } } } */
-/* { dg-final { scan-assembler-times "movaps\[ \t\]*%xmm0, %xmm\[0-9\]+" 15 { 
target { ! ia32 } } } } */
-/* { dg-final { scan-assembler "xorl\[ \t\]*%eax, %eax" } } */
-/* { dg-final { scan-assembler "movl\[ \t\]*%eax, %edx" } } */
-/* { dg-final { scan-assembler "movl\[ \t\]*%eax, %ecx" } } */

Re: [PATCH] fortran/97652 - workaround missing canonicalization of PDT types

2020-11-03 Thread Tobias Burnus

On 03.11.20 12:34, Richard Biener wrote:


This marks PDT types as needing structural comparison for TBAA
if we didn't pick up a canonical variant (which we should IMHO
always do).  This workaround fixes the gfortran.dg/pdt_14.f03
fail which materializes as testsuite timeout which is quite
annoying.

Bootstrap / regtest pending on x86_64-unknown-linux-gnu.
OK?


OK as workaround; still, the question is why it does not find its
sibling – thus, we shall keep the PR open (or open a follow-up PR) to
fix it properly.

Tobias



2020-11-03  Richard Biener  

  PR fortran/97652
gcc/fortran
  * trans-types.c (gfc_get_derived_type): When we didn't find
  a canonical type mark it for structural equality.
---
  gcc/fortran/trans-types.c | 2 ++
  1 file changed, 2 insertions(+)

diff --git a/gcc/fortran/trans-types.c b/gcc/fortran/trans-types.c
index b7129dcbe6d..4643fff243f 100644
--- a/gcc/fortran/trans-types.c
+++ b/gcc/fortran/trans-types.c
@@ -2647,6 +2647,8 @@ gfc_get_derived_type (gfc_symbol * derived, int codimen)
typenode = make_node (RECORD_TYPE);
TYPE_NAME (typenode) = get_identifier (derived->name);
TYPE_PACKED (typenode) = flag_pack_derived;
+  if (!got_canonical)
+ SET_TYPE_STRUCTURAL_EQUALITY (typenode);
derived->backend_decl = typenode;
  }


-
Mentor Graphics (Deutschland) GmbH, Arnulfstraße 201, 80634 München / Germany
Registergericht München HRB 106955, Geschäftsführer: Thomas Heurung, Alexander 
Walter


[patch] vxworks: Sync the aarch64-vx7r2 libgcc config with that of Linux

2020-11-03 Thread Olivier Hainque

This adds ${cpu_type}/t-lse and t-slibgcc-libgcc to the tmake_file
list for aarch64-vxworks7* configurations, as the Linux port does.

t-lse is needed by all triplets now anyway and the standard setting
for slibgcc makes sense as we are working on reintroducing PIC support
for RTPs on various targets. The VxWorks7 system environments are leaning
towards more and more similarities with Linux in general, so the
closer configurations the better.

Checked that this restores the build for --target=aarch64-vxworks7r2
(previously failing from the missing t-lse part), and that we get resonable
test results back after the change posted at:
https://gcc.gnu.org/pipermail/gcc-patches/2020-October/556216.html.

Olivier

2020-11-02  Pat Bernardi  

libgcc/
* config.host (aarch64-vxworks7*, tmake_file): Add
${cpu_type}/t-lse and t-slibgcc-libgcc.

Co-authored-by: Olivier Hainque  

diff --git a/libgcc/config.host b/libgcc/config.host
index 40823f0cff43..66af8343a286 100644
--- a/libgcc/config.host
+++ b/libgcc/config.host
@@ -393,6 +393,7 @@ aarch64*-*-vxworks7*)
extra_parts="$extra_parts crtfastmath.o"
md_unwind_header=aarch64/aarch64-unwind.h
tmake_file="${tmake_file} ${cpu_type}/t-aarch64"
+   tmake_file="${tmake_file} ${cpu_type}/t-lse t-slibgcc-libgcc"
tmake_file="${tmake_file} ${cpu_type}/t-softfp t-softfp t-crtfm"
;;
 alpha*-*-linux*)
-- 
2.17.1






[PATCH] tree-optimization/80928 - SLP vectorize nested loop induction

2020-11-03 Thread Richard Biener
This adds SLP vectorization of nested inductions.

Bootstrapped and tested on x86_64-unknown-linux-gnu, pushed.

2020-11-03  Richard Biener 

PR tree-optimization/80928
* tree-vect-loop.c (vectorizable_induction): SLP vectorize
nested inductions.

* gcc.dg/vect/vect-outer-slp-2.c: New testcase.
* gcc.dg/vect/vect-outer-slp-3.c: Likewise.
---
 gcc/testsuite/gcc.dg/vect/vect-outer-slp-2.c |  51 
 gcc/testsuite/gcc.dg/vect/vect-outer-slp-3.c |  62 ++
 gcc/tree-vect-loop.c | 116 ---
 3 files changed, 164 insertions(+), 65 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-outer-slp-2.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-outer-slp-3.c

diff --git a/gcc/testsuite/gcc.dg/vect/vect-outer-slp-2.c 
b/gcc/testsuite/gcc.dg/vect/vect-outer-slp-2.c
new file mode 100644
index 000..08b4fc52430
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-outer-slp-2.c
@@ -0,0 +1,51 @@
+/* { dg-require-effective-target vect_double } */
+/* { dg-require-effective-target vect_intdouble_cvt } */
+
+#include "tree-vect.h"
+
+double image[40];
+
+void __attribute__((noipa))
+foo (void)
+{
+  for (int i = 0; i < 20; i++)
+{
+  double suma = 0;
+  double sumb = 0;
+  for (int j = 0; j < 40; j++)
+   {
+ suma += j+i;
+ sumb += j+i;
+   }
+  image[2*i] = suma;
+  image[2*i+1] = sumb;
+}
+}
+
+int main ()
+{
+  check_vect ();
+
+  foo ();
+
+  for (int i = 0; i < 20; i++)
+{
+  double suma = 0;
+  double sumb = 0;
+  for (int j = 0; j < 40; j++)
+   {
+ suma += j+i;
+ sumb += j+i;
+ asm ("" : "+g" (suma));
+ asm ("" : "+g" (sumb));
+   }
+  if (image[2*i] != suma
+ || image[2*i+1] != sumb)
+   abort ();
+}
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED" 1 "vect" } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" } 
} */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-outer-slp-3.c 
b/gcc/testsuite/gcc.dg/vect/vect-outer-slp-3.c
new file mode 100644
index 000..c67d3690bb4
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-outer-slp-3.c
@@ -0,0 +1,62 @@
+/* { dg-require-effective-target vect_double } */
+/* { dg-require-effective-target vect_intdouble_cvt } */
+
+#include "tree-vect.h"
+
+double image[40];
+
+void __attribute__((noipa))
+foo (void)
+{
+  for (int i = 0; i < 20; i++)
+{
+  double suma = 0;
+  double sumb = 0;
+  int k = image[2*i];
+  int l = image[2*i+1];
+  for (int j = 0; j < 40; j++)
+{
+  suma += k+i;
+  sumb += l+i;
+  k++;
+  l++;
+}
+  image[2*i] = suma;
+  image[2*i+1] = sumb;
+}
+}
+
+int main ()
+{
+  check_vect ();
+
+  for (int i = 0; i < 40; ++i)
+image[i] = 1.;
+
+  foo ();
+
+  for (int i = 0; i < 20; i++)
+{
+  double suma = 0;
+  double sumb = 0;
+  int k = 1;
+  int l = 1;
+  for (int j = 0; j < 40; j++)
+   {
+  suma += k+i;
+  sumb += l+i;
+ asm ("" : "+g" (suma));
+ asm ("" : "+g" (sumb));
+  k++;
+  l++;
+   }
+  if (image[2*i] != suma
+ || image[2*i+1] != sumb)
+   abort ();
+}
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED" 1 "vect" } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" } 
} */
diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index 6fa185daa28..41e2e2ade20 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -7686,7 +7686,6 @@ vectorizable_induction (loop_vec_info loop_vinfo,
   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
   unsigned i;
   tree expr;
-  gimple_seq stmts;
   gimple_stmt_iterator si;
 
   gphi *phi = dyn_cast  (stmt_info->stmt);
@@ -7726,10 +7725,6 @@ vectorizable_induction (loop_vec_info loop_vinfo,
  return false;
}
 
-  /* FORNOW: outer loop induction with SLP not supported.  */
-  if (STMT_SLP_TYPE (stmt_info))
-   return false;
-
   exit_phi = NULL;
   latch_e = loop_latch_edge (loop->inner);
   loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
@@ -7875,32 +7870,37 @@ vectorizable_induction (loop_vec_info loop_vinfo,
   /* Now generate the IVs.  */
   unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
   gcc_assert ((const_nunits * nvects) % group_size == 0);
-  unsigned nivs = least_common_multiple (group_size,
-const_nunits) / const_nunits;
-  unsigned lup_mul = (nvects * const_nunits) / group_size;
+  unsigned nivs;
+  if (nested_in_vect_loop)
+   nivs = nvects;
+  else
+   nivs = least_common_multiple (group_size,
+ const_nunits) / const_nunits;
   tree stept = TREE_TYPE (step_vectype);
-  tre

Re: [PATCH] fortran/97652 - workaround missing canonicalization of PDT types

2020-11-03 Thread Richard Biener
On Tue, 3 Nov 2020, Paul Richard Thomas wrote:

> Hi Richi,
> 
> That's OK for master and as far back as you have the fortitude to go.

We only know that the issue is exposed on trunk so I'll leave it
there only as a temporary workaround.

> Thanks for the fix.
> 
> I agree that we should always pick up a canonical variant. When I finally
> get "a minute or two" to fix the rather fundamental problems with PDTs, I
> will make sure that this goes away. I am also curious about the time out -
> I'll take a look.

Note the timeout happens because we optimize one loop to an endless
loop; the reason is mentioned in the PR's audit-trail.

Thanks,
Richard.

> 
> Paul
> 
> 
> On Tue, 3 Nov 2020 at 11:35, Richard Biener  wrote:
> 
> > This marks PDT types as needing structural comparison for TBAA
> > if we didn't pick up a canonical variant (which we should IMHO
> > always do).  This workaround fixes the gfortran.dg/pdt_14.f03
> > fail which materializes as testsuite timeout which is quite
> > annoying.
> >
> > Bootstrap / regtest pending on x86_64-unknown-linux-gnu.
> >
> > OK?
> >
> > 2020-11-03  Richard Biener  
> >
> > PR fortran/97652
> > gcc/fortran
> > * trans-types.c (gfc_get_derived_type): When we didn't find
> > a canonical type mark it for structural equality.
> > ---
> >  gcc/fortran/trans-types.c | 2 ++
> >  1 file changed, 2 insertions(+)
> >
> > diff --git a/gcc/fortran/trans-types.c b/gcc/fortran/trans-types.c
> > index b7129dcbe6d..4643fff243f 100644
> > --- a/gcc/fortran/trans-types.c
> > +++ b/gcc/fortran/trans-types.c
> > @@ -2647,6 +2647,8 @@ gfc_get_derived_type (gfc_symbol * derived, int
> > codimen)
> >typenode = make_node (RECORD_TYPE);
> >TYPE_NAME (typenode) = get_identifier (derived->name);
> >TYPE_PACKED (typenode) = flag_pack_derived;
> > +  if (!got_canonical)
> > +   SET_TYPE_STRUCTURAL_EQUALITY (typenode);
> >derived->backend_decl = typenode;
> >  }
> >
> > --
> > 2.26.2
> >
> 
> 
> 

-- 
Richard Biener 
SUSE Software Solutions Germany GmbH, Maxfeldstrasse 5, 90409 Nuernberg,
Germany; GF: Felix Imendörffer


Re: [PATCH] Fix PR97205

2020-11-03 Thread Bernd Edlinger


On 11/3/20 11:16 AM, Richard Biener wrote:
> On Tue, 3 Nov 2020, Bernd Edlinger wrote:
> 
>>
>>
>> On 11/3/20 10:34 AM, Richard Biener wrote:
>>> On Mon, 2 Nov 2020, Bernd Edlinger wrote:
>>>
 On 11/2/20 3:07 PM, Richard Biener wrote:
> On Mon, 2 Nov 2020, Bernd Edlinger wrote:
>
>> Hi,
>>
>> this makes sure that stack allocated SSA_NAMEs are
>> at least MODE_ALIGNED.  Also increase the MEM_ALIGN
>> for the corresponding rtl objects.
>>
>>
>> Tested on x86_64-pc-linux-gnu and arm-none-eabi.
>>
>> OK for trunk?
>
>
> @@ -1022,6 +1030,14 @@ expand_one_stack_var_at (tree decl, rtx base,
> unsigned base_align,
>  }
>  
>set_rtl (decl, x);
> +
> +  if (TREE_CODE (decl) == SSA_NAME
> +  && GET_MODE (x) != BLKmode
> +  && MEM_ALIGN (x) < GET_MODE_ALIGNMENT (GET_MODE (x)))
> +{
> +  gcc_checking_assert (GET_MODE_ALIGNMENT (GET_MODE (x)) <=
> base_align);
> +  set_mem_align (x, GET_MODE_ALIGNMENT (GET_MODE (x)));
> +}
>  }
>  
>
> I wonder whether we cannot "fix" set_rtl to not call
> set_mem_attributes in this path, maybe directly call
> set_mem_align there instead?  That is, the preceeding
> code for ! SSA_NAME already tries to adjust alignment
> to honor that of the actual stack slot - IMHO the
> non-SSA and SSA cases should be merged after this
> patch, but maybe simply by calling set_mem_align
> instead of doing the DECL_ALIGN frobbing and do
> the alignment compute also for SSA_NAMEs?
>
> The other pieces look OK but the above is a bit ugly
> at the moment.
>

 Hmm, how about this?
>>>
>>> That would work for me.  Guess removing the DECL_ALIGN frobbing
>>> in the != SSA_NAME path didn't work out or you didn't try out
>>> of caution?
>>>
>>
>> I didn't try, since it felt simply more correct this way,
>> and get_object_alignment would probably give a different
>> answer since it uses DECL_ALIGN too.
> 
> OK, I see.
> 
> Richard.
> 

Ok, good.

So this is what I will commit shortly.

Thanks
Bernd.
From e281da96736655c42296806162d1b979dd368544 Mon Sep 17 00:00:00 2001
From: Bernd Edlinger 
Date: Sun, 1 Nov 2020 07:32:20 +0100
Subject: [PATCH] Fix PR97205

This makes sure that stack allocated SSA_NAMEs are
at least MODE_ALIGNED.  Also increase the MEM_ALIGN
for the corresponding rtl objects.

gcc:
2020-11-03  Bernd Edlinger  

	PR target/97205
	* cfgexpand.c (align_local_variable): Make SSA_NAMEs
	at least MODE_ALIGNED.
	(expand_one_stack_var_at): Increase MEM_ALIGN for SSA_NAMEs.

testsuite:
2020-11-03  Bernd Edlinger  

	PR target/97205
	* gcc.c-torture/compile/pr97205.c: New test.
---
 gcc/cfgexpand.c   | 43 ---
 gcc/testsuite/gcc.c-torture/compile/pr97205.c |  7 +
 2 files changed, 33 insertions(+), 17 deletions(-)
 create mode 100644 gcc/testsuite/gcc.c-torture/compile/pr97205.c

diff --git a/gcc/cfgexpand.c b/gcc/cfgexpand.c
index f3f17d3..6c41a7e 100644
--- a/gcc/cfgexpand.c
+++ b/gcc/cfgexpand.c
@@ -366,7 +366,15 @@ align_local_variable (tree decl, bool really_expand)
   unsigned int align;
 
   if (TREE_CODE (decl) == SSA_NAME)
-align = TYPE_ALIGN (TREE_TYPE (decl));
+{
+  tree type = TREE_TYPE (decl);
+  machine_mode mode = TYPE_MODE (type);
+
+  align = TYPE_ALIGN (type);
+  if (mode != BLKmode
+	  && align < GET_MODE_ALIGNMENT (mode))
+	align = GET_MODE_ALIGNMENT (mode);
+}
   else
 {
   align = LOCAL_DECL_ALIGNMENT (decl);
@@ -999,20 +1007,21 @@ expand_one_stack_var_at (tree decl, rtx base, unsigned base_align,
   x = plus_constant (Pmode, base, offset);
   x = gen_rtx_MEM (TREE_CODE (decl) == SSA_NAME
 		   ? TYPE_MODE (TREE_TYPE (decl))
-		   : DECL_MODE (SSAVAR (decl)), x);
+		   : DECL_MODE (decl), x);
+
+  /* Set alignment we actually gave this decl if it isn't an SSA name.
+ If it is we generate stack slots only accidentally so it isn't as
+ important, we'll simply set the alignment directly on the MEM.  */
+
+  if (base == virtual_stack_vars_rtx)
+offset -= frame_phase;
+  align = known_alignment (offset);
+  align *= BITS_PER_UNIT;
+  if (align == 0 || align > base_align)
+align = base_align;
 
   if (TREE_CODE (decl) != SSA_NAME)
 {
-  /* Set alignment we actually gave this decl if it isn't an SSA name.
- If it is we generate stack slots only accidentally so it isn't as
-	 important, we'll simply use the alignment that is already set.  */
-  if (base == virtual_stack_vars_rtx)
-	offset -= frame_phase;
-  align = known_alignment (offset);
-  align *= BITS_PER_UNIT;
-  if (align == 0 || align > base_align)
-	align = base_align;
-
   /* One would think that we could assert that we're not decreasing
 	 alignment here, but (at least) the i386 port does exactly this
 	 via the MINIMUM_ALIGNMENT hook.  */
@@ -1022,6 +1031,8 @@ expand_one_st

[PATCH] Save some memory at debug stream-in time

2020-11-03 Thread Richard Biener
This allows us to release references to BLOCKs by not keeping
them rooted in the external_die_map but instead remove it from
there as soon as we created the corresponding stub DIE.  For
decls it doesn't help since we still keep the decl_die_table.

Bootstrapped on x86_64-unknown-linux-gnu, testing in progress.

2020-11-03  Richard Biener  

* dwarf2out.c (maybe_create_die_with_external_ref): Remove
hashtable entry.
---
 gcc/dwarf2out.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/gcc/dwarf2out.c b/gcc/dwarf2out.c
index 534877babfb..64ac94a8cbd 100644
--- a/gcc/dwarf2out.c
+++ b/gcc/dwarf2out.c
@@ -5974,6 +5974,7 @@ maybe_create_die_with_external_ref (tree decl)
 
   const char *sym = desc->sym;
   unsigned HOST_WIDE_INT off = desc->off;
+  external_die_map->remove (decl);
 
   in_lto_p = false;
   dw_die_ref die = (TREE_CODE (decl) == BLOCK
-- 
2.26.2


[PATCH 5/X] libsanitizer: mid-end: Introduce stack variable handling for HWASAN

2020-11-03 Thread Matthew Malcomson via Gcc-patches
Hi Richard,

I'm sending up the revised patch 5 (introducing stack variable handling)
without the other changes to other patches.

I figure there's been quite a lot of changes to this patch and I wanted
to give you time to review them while I worked on finishing the less
widespread changes in patch 6 and before I ran the more exhaustive (and
time-consuming) tests in case you didn't like the changes and those
exhaustive tests would just have to get repeated.

The big differences between this and the last version are:

- I moved all helper functions which rely on how the tag is encoded into
  hooks.  This allows backends to choose a different ABI for hwasan
  tagging.  That said, any ABI which doesn't ensure the entire the tag
  is stored as a top byte is unsupported as the library doesn't handle
  anything else.

- I no longer delay emitting RTL to initialise the hwasan_base_pointer.
  It's now emitted as soon as the pointer is used anywhere.

- No longer delay allocating stack variables for hwasan to work.
  Instead we record stack variables when allocating through
  expand_stack_var_1 *and* when allocating through expand_stack_vars.

- Use `frame_offset` in more places to avoid having to manually handle
  the case of `!FRAME_GROWS_DOWNWARDS`.





Handling stack variables has three features.

1) Ensure HWASAN required alignment for stack variables

When tagging shadow memory, we need to ensure that each tag granule is
only used by one variable at a time.

This is done by ensuring that each tagged variable is aligned to the tag
granule representation size and also ensure that the end of each
object is aligned to ensure the start of any other data stored on the
stack is in a different granule.

This patch ensures the above by adding alignment requirements in
`align_local_variable` and forcing the stack pointer to be aligned before
allocating any stack objects.

2) Put tags into each stack variable pointer

Make sure that every pointer to a stack variable includes a tag of some
sort on it.

The way tagging works is:
  1) For every new stack frame, a random tag is generated.
  2) A base register is formed from the stack pointer value and this
 random tag.
  3) References to stack variables are now formed with RTL describing an
 offset from this base in both tag and value.

The random tag generation is handled by a backend hook.  This hook
decides whether to introduce a random tag or use the stack background
based on the parameter hwasan-random-frame-tag.  Using the stack
background is necessary for testing and bootstrap.  It is necessary
during bootstrap to avoid breaking the `configure` test program for
determining stack direction.

Using the stack background means that every stack frame has the initial
tag of zero and variables are tagged with incrementing tags from 1,
which also makes debugging a bit easier.

The tag&value offsets are also handled by a backend hook.

This patch also adds some macros defining how the HWASAN shadow memory
is stored and how a tag is stored in a pointer.

3) For each stack variable, tag and untag the shadow stack on function
   prologue and epilogue.

On entry to each function we tag the relevant shadow stack region for
each stack variable the tag to match the tag added to each pointer for
that variable.

This is the first patch where we use the HWASAN shadow space, so we need
to add in the libhwasan initialisation code that creates this shadow
memory region into the binary we produce.  This instrumentation is done
in `compile_file`.

When exiting a function we need to ensure the shadow stack for this
function has no remaining tag.  Without clearing the shadow stack area
for this stack frame, later function calls could get false positives
when those later function calls check untagged areas (such as parameters
passed on the stack) against a shadow stack area with left-over tag.

Hence we ensure that the entire stack frame is cleared on function exit.


ChangeLog:

* config/bootstrap-hwasan.mk: Disable random frame tags for
stack-tagging during bootstrap.
* gcc/asan.c (struct hwasan_stack_var): New.
(hwasan_sanitize_p): New.
(hwasan_sanitize_stack_p): New.
(hwasan_sanitize_allocas_p): New.
(initialize_sanitizer_builtins): Define new builtins.
(ATTR_NOTHROW_LIST): New macro.
(hwasan_current_frame_tag): New.
(hwasan_frame_base): New.
(hwasan_record_stack_var): New.
(hwasan_get_frame_extent): New.
(hwasan_increment_frame_tag): New.
(hwasan_record_frame_init): New.
(hwasan_emit_prologue): New.
(hwasan_emit_untag_frame): New.
(hwasan_finish_file): New.
(hwasan_truncate_to_tag_size): New.
* gcc/asan.h (hwasan_record_frame_init): New declaration.
(hwasan_record_stack_var): New declaration.
(hwasan_emit_prologue): New declaration.
(hw

libcpp: dependency emission tidying

2020-11-03 Thread Nathan Sidwell


This patch cleans up the interface to the dependency generation a
little.  We now only check the option in one place, and the
cpp_get_deps function returns nullptr if there are no dependencies.  I
also reworded the -MT and -MQ help text to be make agnostic -- as
there are ideas about emitting, say, JSON.

libcpp/
* include/mdeps.h: Include cpplib.h
(deps_write): Adjust first parm type.
* mkdeps.c: Include internal.h
(make_write): Adjust first parm type.  Check phony option
directly.
(deps_write): Adjust first parm type.
* init.c (cpp_read_main_file): Use get_deps.
* directives.c (cpp_get_deps): Check option before initializing.
gcc/c-family/
* c.opt (MQ,MT): Reword description to be make-agnostic.
gcc/fortran/
* cpp.c (gfc_cpp_add_dep): Only add dependency if we're recording
them.
(gfc_cpp_init): Likewise for target.

pushing to trunk

--
Nathan Sidwell
diff --git c/gcc/c-family/c.opt w/gcc/c-family/c.opt
index 10e53ea67c9..426636be839 100644
--- c/gcc/c-family/c.opt
+++ w/gcc/c-family/c.opt
@@ -242,11 +242,11 @@ Generate phony targets for all headers.
 
 MQ
 C ObjC C++ ObjC++ Joined Separate MissingArgError(missing makefile target after %qs)
--MQ 	Add a MAKE-quoted target.
+-MQ 	Add a target that may require quoting.
 
 MT
 C ObjC C++ ObjC++ Joined Separate MissingArgError(missing makefile target after %qs)
--MT 	Add an unquoted target.
+-MT 	Add a target that does not require quoting.
 
 P
 C ObjC C++ ObjC++
diff --git c/gcc/fortran/cpp.c w/gcc/fortran/cpp.c
index dcde5576cd5..51baf141711 100644
--- c/gcc/fortran/cpp.c
+++ w/gcc/fortran/cpp.c
@@ -222,13 +222,15 @@ void
 gfc_cpp_add_dep (const char *name, bool system)
 {
   if (!gfc_cpp_option.deps_skip_system || !system)
-deps_add_dep (cpp_get_deps (cpp_in), name);
+if (mkdeps *deps = cpp_get_deps (cpp_in))
+  deps_add_dep (deps, name);
 }
 
 void
 gfc_cpp_add_target (const char *name)
 {
-  deps_add_target (cpp_get_deps (cpp_in), name, 0);
+  if (mkdeps *deps = cpp_get_deps (cpp_in))
+deps_add_target (deps, name, 0);
 }
 
 
@@ -605,8 +607,8 @@ gfc_cpp_init (void)
 	cpp_assert (cpp_in, opt->arg);
 	}
   else if (opt->code == OPT_MT || opt->code == OPT_MQ)
-	deps_add_target (cpp_get_deps (cpp_in),
-			 opt->arg, opt->code == OPT_MQ);
+	if (mkdeps *deps = cpp_get_deps (cpp_in))
+	  deps_add_target (deps, opt->arg, opt->code == OPT_MQ);
 }
 
   /* Pre-defined macros for non-required INTEGER kind types.  */
diff --git c/libcpp/directives.c w/libcpp/directives.c
index d7b59aae901..4295a67f1e5 100644
--- c/libcpp/directives.c
+++ w/libcpp/directives.c
@@ -2572,7 +2572,7 @@ cpp_set_callbacks (cpp_reader *pfile, cpp_callbacks *cb)
 class mkdeps *
 cpp_get_deps (cpp_reader *pfile)
 {
-  if (!pfile->deps)
+  if (!pfile->deps && CPP_OPTION (pfile, deps.style) != DEPS_NONE)
 pfile->deps = deps_init ();
   return pfile->deps;
 }
diff --git c/libcpp/include/mkdeps.h w/libcpp/include/mkdeps.h
index 6d05351cb4a..593b718aaeb 100644
--- c/libcpp/include/mkdeps.h
+++ w/libcpp/include/mkdeps.h
@@ -23,6 +23,8 @@ along with this program; see the file COPYING3.  If not see
 #ifndef LIBCPP_MKDEPS_H
 #define LIBCPP_MKDEPS_H
 
+#include "cpplib.h"
+
 /* This is the data structure used by all the functions in mkdeps.c.
It's quite straightforward, but should be treated as opaque.  */
 
@@ -55,9 +57,9 @@ extern void deps_add_default_target (class mkdeps *, const char *);
dependency entered should be the primary source file.  */
 extern void deps_add_dep (class mkdeps *, const char *);
 
-/* Write out a deps buffer to a specified file.  The third argument
+/* Write out a deps buffer to a specified file.  The last argument
is the number of columns to word-wrap at (0 means don't wrap).  */
-extern void deps_write (const class mkdeps *, FILE *, bool, unsigned int);
+extern void deps_write (const cpp_reader *, FILE *, unsigned int);
 
 /* Write out a deps buffer to a file, in a form that can be read back
with deps_restore.  Returns nonzero on error, in which case the
diff --git c/libcpp/init.c w/libcpp/init.c
index 454a183134a..5b2607e3767 100644
--- c/libcpp/init.c
+++ w/libcpp/init.c
@@ -667,14 +667,9 @@ cpp_post_options (cpp_reader *pfile)
 const char *
 cpp_read_main_file (cpp_reader *pfile, const char *fname, bool injecting)
 {
-  if (CPP_OPTION (pfile, deps.style) != DEPS_NONE)
-{
-  if (!pfile->deps)
-	pfile->deps = deps_init ();
-
-  /* Set the default target (if there is none already).  */
-  deps_add_default_target (pfile->deps, fname);
-}
+  if (mkdeps *deps = cpp_get_deps (pfile))
+/* Set the default target (if there is none already).  */
+deps_add_default_target (pfile->deps, fname);
 
   pfile->main_file
 = _cpp_find_file (pfile, fname, &pfile->no_search_path, /*angle=*/0,
@@ -813,9 +808,8 @@ cpp_finish (cpp_reader *pfile, FILE *deps_stream)
   while (pfile->buffer)
 _cpp

Re: [PATCH][AArch64] ACLE intrinsics: convert from BFloat16 to Float32

2020-11-03 Thread Dennis Zhang via Gcc-patches



On 11/2/20 7:05 PM, Richard Sandiford wrote:

Dennis Zhang  writes:

Hi Richard,

On 10/29/20 5:48 PM, Richard Sandiford wrote:

Dennis Zhang  writes:

diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def 
b/gcc/config/aarch64/aarch64-simd-builtins.def
index 5bc596dbffc..b68c3ca7f4b 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -732,3 +732,8 @@
 VAR1 (UNOP, bfcvtn_q, 0, ALL, v8bf)
 VAR1 (BINOP, bfcvtn2, 0, ALL, v8bf)
 VAR1 (UNOP, bfcvt, 0, ALL, bf)
+
+  /* Implemented by aarch64_{v}bfcvt{_high}.  */
+  VAR2 (UNOP, vbfcvt, 0, ALL, v4bf, v8bf)
+  VAR1 (UNOP, vbfcvt_high, 0, ALL, v8bf)
+  VAR1 (UNOP, bfcvt, 0, ALL, sf)


New intrinsics should use something more specific than “ALL”.
Since these functions are pure non-trapping integer operations,
I think they should use “AUTO_FP” instead.  (On reflection,
we should probably change the name.)


+(define_insn "aarch64_bfcvtsf"
+  [(set (match_operand:SF 0 "register_operand" "=w")
+   (unspec:SF [(match_operand:BF 1 "register_operand" "w")]
+   UNSPEC_BFCVT))]
+  "TARGET_BF16_FP"
+  "shl\\t%d0, %d1, #16"
+  [(set_attr "type" "neon_shift_reg")]


I think this should be neon_shift_imm instead.

OK with those changes, thanks.

Richard



I've fixed the Flag and the insn attribute.
I will commit it if no further issues.


LGTM, thanks.

Richard


Thanks Richard!
This patch is committed as f7d6961126a7f06c8089d8a58bd21be43bc16806.

Bests
Dennis


c++: rtti cleanups

2020-11-03 Thread Nathan Sidwell


Here are a few cleanups from the modules branch.  Generally some RAII,
and a bit of lazy namespace pushing.

gcc/cp/
* rtti.c (init_rtti_processing): Move var decl to its init.
(get_tinfo_decl): Likewise.  Break out creation to called helper
...
(get_tinfo_decl_direct): ... here.
(build_dynamic_cast_1): Move var decls to their initializers.
(tinfo_base_init): Set decl's location to BUILTINS_LOCATION.
(get_tinfo_desc): Only push ABI namespace when needed.  Set type's
context.

pushing to trunk

--
Nathan Sidwell
diff --git i/gcc/cp/rtti.c w/gcc/cp/rtti.c
index 7c4bff76e8c..887aae31bf6 100644
--- i/gcc/cp/rtti.c
+++ w/gcc/cp/rtti.c
@@ -123,6 +123,7 @@ static GTY (()) vec *tinfo_descs;
 
 static tree ifnonnull (tree, tree, tsubst_flags_t);
 static tree tinfo_name (tree, bool);
+static tree get_tinfo_decl_direct (tree type, tree name, int pseudo_ix);
 static tree build_dynamic_cast_1 (location_t, tree, tree, tsubst_flags_t);
 static tree throw_bad_cast (void);
 static tree throw_bad_typeid (void);
@@ -166,10 +167,8 @@ pop_abi_namespace (void)
 void
 init_rtti_processing (void)
 {
-  tree type_info_type;
-
   push_nested_namespace (std_node);
-  type_info_type = xref_tag (class_type, get_identifier ("type_info"));
+  tree type_info_type = xref_tag (class_type, get_identifier ("type_info"));
   pop_nested_namespace (std_node);
   const_type_info_type_node
 = cp_build_qualified_type (type_info_type, TYPE_QUAL_CONST);
@@ -414,9 +413,6 @@ tinfo_name (tree type, bool mark_private)
 tree
 get_tinfo_decl (tree type)
 {
-  tree name;
-  tree d;
-
   if (variably_modified_type_p (type, /*fn=*/NULL_TREE))
 {
   error ("cannot create type information for type %qT because "
@@ -429,25 +425,41 @@ get_tinfo_decl (tree type)
 type = build_function_type (TREE_TYPE (type),
 TREE_CHAIN (TYPE_ARG_TYPES (type)));
 
-  type = complete_type (type);
+  return get_tinfo_decl_direct (type, NULL, -1);
+}
 
+/* Get or create a tinfo VAR_DECL directly from the provided information.
+   The caller must have already checked it is valid to do so.  */
+
+static tree
+get_tinfo_decl_direct (tree type, tree name, int pseudo_ix)
+{
   /* For a class type, the variable is cached in the type node
  itself.  */
+  tree d = NULL_TREE;
+
+  gcc_checking_assert (TREE_CODE (type) != METHOD_TYPE);
+
+  if (pseudo_ix < 0)
+type = complete_type (type);
+
   if (CLASS_TYPE_P (type))
-{
-  d = CLASSTYPE_TYPEINFO_VAR (TYPE_MAIN_VARIANT (type));
-  if (d)
-	return d;
-}
+d = CLASSTYPE_TYPEINFO_VAR (TYPE_MAIN_VARIANT (type));
+
+  if (!name)
+name = mangle_typeinfo_for_type (type);
 
-  name = mangle_typeinfo_for_type (type);
+  if (!CLASS_TYPE_P (type))
+d = get_global_binding (name);
 
-  d = get_global_binding (name);
   if (!d)
 {
-  int ix = get_pseudo_ti_index (type);
-  const tinfo_s *ti = get_tinfo_desc (ix);
-  
+  /* Create it.  */
+  if (pseudo_ix < 0)
+	pseudo_ix = get_pseudo_ti_index (type);
+
+  const tinfo_s *ti = get_tinfo_desc (pseudo_ix);
+
   d = build_lang_decl (VAR_DECL, name, ti->type);
   SET_DECL_ASSEMBLER_NAME (d, name);
   /* Remember the type it is for.  */
@@ -754,23 +766,21 @@ build_dynamic_cast_1 (location_t loc, tree type, tree expr,
 	  dcast_fn = dynamic_cast_node;
 	  if (!dcast_fn)
 	{
-	  tree tmp;
-	  tree tinfo_ptr;
-	  const char *name;
-
 	  push_abi_namespace ();
-	  tinfo_ptr = xref_tag (class_type,
-get_identifier ("__class_type_info"));
-	  tinfo_ptr = build_pointer_type
-		(cp_build_qualified_type
-		 (tinfo_ptr, TYPE_QUAL_CONST));
-	  name = "__dynamic_cast";
-	  tmp = build_function_type_list (ptr_type_node,
-	  const_ptr_type_node,
-	  tinfo_ptr, tinfo_ptr,
-	  ptrdiff_type_node, NULL_TREE);
-	  dcast_fn = build_library_fn_ptr (name, tmp,
-	   ECF_LEAF | ECF_PURE | ECF_NOTHROW);
+	  tree tinfo_ptr = xref_tag (class_type,
+	 get_identifier ("__class_type_info"));
+	  tinfo_ptr = cp_build_qualified_type (tinfo_ptr, TYPE_QUAL_CONST);
+	  tinfo_ptr = build_pointer_type (tinfo_ptr);
+
+	  const char *fn_name = "__dynamic_cast";
+	  /* void *() (void const *, __class_type_info const *,
+		   __class_type_info const *, ptrdiff_t)  */
+	  tree fn_type = (build_function_type_list
+			  (ptr_type_node, const_ptr_type_node,
+			   tinfo_ptr, tinfo_ptr, ptrdiff_type_node,
+			   NULL_TREE));
+	  dcast_fn = (build_library_fn_ptr
+			  (fn_name, fn_type, ECF_LEAF | ECF_PURE | ECF_NOTHROW));
 	  pop_abi_namespace ();
 	  dynamic_cast_node = dcast_fn;
 	}
@@ -947,6 +957,8 @@ tinfo_base_init (tinfo_s *ti, tree target)
 {
   push_abi_namespace ();
   tree real_type = xref_tag (class_type, ti->name);
+  tree real_decl = TYPE_NAME (real_type);
+  DECL_SOURCE_LOCATION (real_decl) = BUILTINS_LOCATION;
 

c++: cp_tree_equal cleanups

2020-11-03 Thread Nathan Sidwell


A couple of small fixes.  I noticed bind_template_template_parms was
not marking the parm a template parm (this broke some module
handling).  Debugging CALL_EXPR comparisons led me to refactor
cp_tree_equal's CALL_EXPR code (and my recent fix to debug printing of
same).  Finally TREE_VECS are best compared by comp_template_args.  I
recall that last piece being a left over from fixes during gcc-10.
I've been using it on the modules branch since then.

gcc/cp/
* tree.c (bind_template_template_parm): Mark the parm as a
template parm.
(cp_tree_equal): Refactor CALL_EXPR.  Use comp_template_args for
TREE_VECs.

pushing to trunk

--
Nathan Sidwell
diff --git i/gcc/cp/tree.c w/gcc/cp/tree.c
index 9bc37aca95b..3087c4ab52c 100644
--- i/gcc/cp/tree.c
+++ w/gcc/cp/tree.c
@@ -2700,6 +2700,7 @@ bind_template_template_parm (tree t, tree newargs)
   t2 = cxx_make_type (BOUND_TEMPLATE_TEMPLATE_PARM);
   decl = build_decl (input_location,
 		 TYPE_DECL, DECL_NAME (decl), NULL_TREE);
+  SET_DECL_TEMPLATE_PARM_P (decl);
 
   /* These nodes have to be created to reflect new TYPE_DECL and template
  arguments.  */
@@ -3671,20 +3672,28 @@ cp_tree_equal (tree t1, tree t2)
 
 case CALL_EXPR:
   {
-	tree arg1, arg2;
-	call_expr_arg_iterator iter1, iter2;
-	if (KOENIG_LOOKUP_P (t1) != KOENIG_LOOKUP_P (t2)
-	|| !called_fns_equal (CALL_EXPR_FN (t1), CALL_EXPR_FN (t2)))
+	if (KOENIG_LOOKUP_P (t1) != KOENIG_LOOKUP_P (t2))
 	  return false;
-	for (arg1 = first_call_expr_arg (t1, &iter1),
-	   arg2 = first_call_expr_arg (t2, &iter2);
-	 arg1 && arg2;
-	 arg1 = next_call_expr_arg (&iter1),
-	   arg2 = next_call_expr_arg (&iter2))
-	  if (!cp_tree_equal (arg1, arg2))
-	return false;
-	if (arg1 || arg2)
+
+	if (!called_fns_equal (CALL_EXPR_FN (t1), CALL_EXPR_FN (t2)))
+	  return false;
+
+	call_expr_arg_iterator iter1, iter2;
+	init_call_expr_arg_iterator (t1, &iter1);
+	init_call_expr_arg_iterator (t2, &iter2);
+	if (iter1.n != iter2.n)
 	  return false;
+
+	while (more_call_expr_args_p (&iter1))
+	  {
+	tree arg1 = next_call_expr_arg (&iter1);
+	tree arg2 = next_call_expr_arg (&iter2);
+
+	gcc_checking_assert (arg1 && arg2);
+	if (!cp_tree_equal (arg1, arg2))
+	  return false;
+	  }
+
 	return true;
   }
 
@@ -3779,16 +3788,11 @@ cp_tree_equal (tree t1, tree t2)
  CHECK_CONSTR_ARGS (t2)));
 
 case TREE_VEC:
-  {
-	unsigned ix;
-	if (TREE_VEC_LENGTH (t1) != TREE_VEC_LENGTH (t2))
-	  return false;
-	for (ix = TREE_VEC_LENGTH (t1); ix--;)
-	  if (!cp_tree_equal (TREE_VEC_ELT (t1, ix),
-			  TREE_VEC_ELT (t2, ix)))
-	return false;
-	return true;
-  }
+  /* These are template args.  Really we should be getting the
+	 caller to do this as it knows it to be true.  */
+  if (!comp_template_args (t1, t2, NULL, NULL, false))
+	return false;
+  return true;
 
 case SIZEOF_EXPR:
 case ALIGNOF_EXPR:


Re: [PATCH] Optimize macro: make it more predictable

2020-11-03 Thread Richard Biener via Gcc-patches
On Fri, Oct 23, 2020 at 1:47 PM Martin Liška  wrote:
>
> Hey.
>
> This is a follow-up of the discussion that happened in thread about 
> no_stack_protector
> attribute: https://gcc.gnu.org/pipermail/gcc-patches/2020-May/545916.html
>
> The current optimize attribute works in the following way:
> - 1) we take current global_options as base
> - 2) maybe_default_options is called for the currently selected optimization 
> level, which
>   means all rules in default_options_table are executed
> - 3) attribute values are applied (via decode_options)
>
> So the step 2) is problematic: in case of -O2 -fno-omit-frame-pointer and 
> __attribute__((optimize("-fno-stack-protector")))
> ends basically with -O2 -fno-stack-protector because -fno-omit-frame-pointer 
> is default:
>  /* -O1 and -Og optimizations.  */
>  { OPT_LEVELS_1_PLUS, OPT_fomit_frame_pointer, NULL, 1 },
>
> My patch handled and the current optimize attribute really behaves that same 
> as appending attribute value
> to the command line. So far so good. We should also reflect that in 
> documentation entry which is quite
> vague right now:
>
> """
> The optimize attribute is used to specify that a function is to be compiled 
> with different optimization options than specified on the command line.
> """
>
> and we may want to handle -Ox in the attribute in a special way. I guess many 
> macro/pragma users expect that
>
> -O2 -ftree-vectorize and __attribute__((optimize(1))) will end with -O1 and 
> not
> with -ftree-vectorize -O1 ?

Hmm.  I guess the only two reasonable options are to append to the active set
and thus end up with -ftree-vectorize -O1 or to start from an empty set and thus
end up with -O1.

Maybe we can have

@item optimize (@var{level}, @dots{})

reset everything to plain -On and

@item optimize (@var{string}, @dots{})

append?  So optimize("O1") will end up with -O2 -ftree-vectorize -O1 and
optimize(1) with -O1?  How do we handle

void __attribute__((optimize(1),optimize("ftree-vectorize")))

thus two optimize attributes?

> I'm also planning to take a look at the target macro/attribute, I expect 
> similar problems:
> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=97469
>
> Thoughts?
> Thanks,
> Martin
>
> gcc/c-family/ChangeLog:
>
> * c-common.c (parse_optimize_options): Decoded attribute options
> with the ones that were already set on the command line.
>
> gcc/ChangeLog:
>
> * toplev.c (toplev::main): Save decoded Optimization options.
> * toplev.h (save_opt_decoded_options): New.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/i386/avx512er-vrsqrt28ps-3.c: Disable -ffast-math.
> * gcc.target/i386/avx512er-vrsqrt28ps-5.c: Likewise.
> ---
>   gcc/c-family/c-common.c   | 15 ++-
>   .../gcc.target/i386/avx512er-vrsqrt28ps-3.c   |  2 +-
>   .../gcc.target/i386/avx512er-vrsqrt28ps-5.c   |  2 +-
>   gcc/toplev.c  |  8 
>   gcc/toplev.h  |  1 +
>   5 files changed, 25 insertions(+), 3 deletions(-)
>
> diff --git a/gcc/c-family/c-common.c b/gcc/c-family/c-common.c
> index e16ca3894bc..d4342e93d0a 100644
> --- a/gcc/c-family/c-common.c
> +++ b/gcc/c-family/c-common.c
> @@ -5727,10 +5727,23 @@ parse_optimize_options (tree args, bool attr_p)
> j++;
>   }
> decoded_options_count = j;
> +
> +  /* Merge the decoded options with save_decoded_options.  */
> +  unsigned save_opt_count = save_opt_decoded_options.length ();
> +  unsigned merged_decoded_options_count = save_opt_count + 
> decoded_options_count;
> +  cl_decoded_option *merged_decoded_options
> += XNEWVEC (cl_decoded_option, merged_decoded_options_count);
> +
> +  for (unsigned i = 0; i < save_opt_count; ++i)
> +merged_decoded_options[i] = save_opt_decoded_options[i];
> +  for (unsigned i = 0; i < decoded_options_count; ++i)
> +merged_decoded_options[save_opt_count + i] = decoded_options[i];
> +
> /* And apply them.  */
> decode_options (&global_options, &global_options_set,
> - decoded_options, decoded_options_count,
> + merged_decoded_options, merged_decoded_options_count,
>   input_location, global_dc, NULL);
> +  free (decoded_options);
>
> targetm.override_options_after_change();
>
> diff --git a/gcc/testsuite/gcc.target/i386/avx512er-vrsqrt28ps-3.c 
> b/gcc/testsuite/gcc.target/i386/avx512er-vrsqrt28ps-3.c
> index 1ba8172d6e3..40aefb50844 100644
> --- a/gcc/testsuite/gcc.target/i386/avx512er-vrsqrt28ps-3.c
> +++ b/gcc/testsuite/gcc.target/i386/avx512er-vrsqrt28ps-3.c
> @@ -8,7 +8,7 @@
>   #define MAX 1000
>   #define EPS 0.1
>
> -__attribute__ ((noinline, optimize (1)))
> +__attribute__ ((noinline, optimize (1, "-fno-fast-math")))
>   void static
>   compute_rsqrt_ref (float *a, float *r)
>   {
> diff --git a/gcc/testsuite/gcc.target/i386/avx512er-vrsqrt28ps-5.c 
> b/gcc/testsuite/gcc.target/i386/avx512er-vrsqr

Re: [PATCH 6/x] arm: Add vstN_lane_bf16 + vstNq_lane_bf16 intrisics

2020-11-03 Thread Andrea Corallo via Gcc-patches
Kyrylo Tkachov  writes:

>> -Original Message-
>> From: Andrea Corallo 
>> Sent: 02 November 2020 09:04
>> To: gcc-patches@gcc.gnu.org
>> Cc: Kyrylo Tkachov ; Richard Earnshaw
>> ; nd 
>> Subject: [PATCH 6/x] arm: Add vstN_lane_bf16 + vstNq_lane_bf16 intrisics
>> 
>> Hi all,
>> 
>> last patch for this series, adding vst2_lane_bf16, vst2q_lane_bf16,
>> vst3_lane_bf16, vst3q_lane_bf16, vst4_lane_bf16, vst4q_lane_bf16
>> related neon intrinsics.
>> 
>> Please refer to:
>> ACLE 
>> ISA  
>> 
>> Regtested and bootstrapped.
>> 
>
> Ok.
> Thanks,
> Kyrill

Hi Kyrill,

I've installed into master this series (implementing your suggestion for
3/x).

ed62f3668b5 arm: Add vstN_lane_bf16 + vstNq_lane_bf16 intrisics
1528f34341b arm: Add vldN_lane_bf16 + vldNq_lane_bf16 intrisics
6170a793b7f arm: Add vst1_bf16 + vst1q_bf16 intrinsics
890076673d4 arm: Add vld1_bf16 + vld1q_bf16 intrinsics
d65303b6994 arm: Add vst1_lane_bf16 + vstq_lane_bf16 intrinsics

I'll follow up for the backports if you are okay with that.

Thanks

  Andrea


Re: Add fnspec to C++ new and delete

2020-11-03 Thread Richard Biener
On Tue, 27 Oct 2020, Jan Hubicka wrote:

> Hi,
> this patch makes C++ new and delete operators to be handled as
> malloc/free for fnspec.
> 
> I still do not understand why free is ".co " and not ".cO ".
> I do not think we need to invalidate memory referenced to by blockbeing
> freed.
> 
> Bootstrapped/regtested x86_64-linux, OK?

OK.

Richard.

> Honza
> 
> gcc/ChangeLog:
> 
> 2020-10-27  Jan Hubicka  
> 
>   * gimple.c (gimple_call_fnspec): Handle C++ new and delete.
>   * gimple.h (gimple_call_from_new_or_delete): Constify parameter.
> 
> gcc/testsuite/ChangeLog:
> 
> 2020-10-27  Jan Hubicka  
> 
>   * g++.dg/ipa/devirt-24.C: Update template.
> 
> diff --git a/gcc/gimple.c b/gcc/gimple.c
> index 469e6f369f3..1afed88e1f1 100644
> --- a/gcc/gimple.c
> +++ b/gcc/gimple.c
> @@ -1510,6 +1510,19 @@ gimple_call_fnspec (const gcall *stmt)
>  }
>if (gimple_call_builtin_p (stmt, BUILT_IN_NORMAL))
>  return builtin_fnspec (gimple_call_fndecl (stmt));
> +  tree fndecl = gimple_call_fndecl (stmt);
> +  /* If the call is to a replaceable operator delete and results
> + from a delete expression as opposed to a direct call to
> + such operator, then we can treat it as free.  */
> +  if (fndecl
> +  && DECL_IS_OPERATOR_DELETE_P (fndecl)
> +  && gimple_call_from_new_or_delete (stmt))
> +return ".co ";
> +  /* Similarly operator new can be treated as malloc.  */
> +  if (fndecl
> +  && DECL_IS_OPERATOR_NEW_P (fndecl)
> +  && gimple_call_from_new_or_delete (stmt))
> +return "mC";
>return "";
>  }
>  
> diff --git a/gcc/gimple.h b/gcc/gimple.h
> index 3c9b9965f5a..fdb00d57b07 100644
> --- a/gcc/gimple.h
> +++ b/gcc/gimple.h
> @@ -3405,7 +3405,7 @@ gimple_call_set_from_new_or_delete (gcall *s, bool 
> from_new_or_delete_p)
> from a new or delete expression.  */
>  
>  static inline bool
> -gimple_call_from_new_or_delete (gcall *s)
> +gimple_call_from_new_or_delete (const gcall *s)
>  {
>return (s->subcode & GF_CALL_FROM_NEW_OR_DELETE) != 0;
>  }
> diff --git a/gcc/testsuite/g++.dg/ipa/devirt-24.C 
> b/gcc/testsuite/g++.dg/ipa/devirt-24.C
> index eaef1f5b3f8..7b5b806dd05 100644
> --- a/gcc/testsuite/g++.dg/ipa/devirt-24.C
> +++ b/gcc/testsuite/g++.dg/ipa/devirt-24.C
> @@ -37,4 +37,4 @@ C *b = new (C);
>}
>  }
>  /* { dg-final { scan-ipa-dump-times "Discovered a virtual call to a known 
> target" 1 "inline" { xfail *-*-* } } } */
> -/* { dg-final { scan-ipa-dump-times "Aggregate passed by reference" 1 "cp"  
> } } */
> +/* { dg-final { scan-ipa-dump-times "Aggregate passed by reference" 2 "cp"  
> } } */
> 

-- 
Richard Biener 
SUSE Software Solutions Germany GmbH, Maxfeldstrasse 5, 90409 Nuernberg,
Germany; GF: Felix Imendörffer


Re: [PATCH] Optimize macro: make it more predictable

2020-11-03 Thread Jakub Jelinek via Gcc-patches
On Tue, Nov 03, 2020 at 02:27:52PM +0100, Richard Biener wrote:
> On Fri, Oct 23, 2020 at 1:47 PM Martin Liška  wrote:
> > This is a follow-up of the discussion that happened in thread about 
> > no_stack_protector
> > attribute: https://gcc.gnu.org/pipermail/gcc-patches/2020-May/545916.html
> >
> > The current optimize attribute works in the following way:
> > - 1) we take current global_options as base
> > - 2) maybe_default_options is called for the currently selected 
> > optimization level, which
> >   means all rules in default_options_table are executed
> > - 3) attribute values are applied (via decode_options)
> >
> > So the step 2) is problematic: in case of -O2 -fno-omit-frame-pointer and 
> > __attribute__((optimize("-fno-stack-protector")))
> > ends basically with -O2 -fno-stack-protector because 
> > -fno-omit-frame-pointer is default:
> >  /* -O1 and -Og optimizations.  */
> >  { OPT_LEVELS_1_PLUS, OPT_fomit_frame_pointer, NULL, 1 },
> >
> > My patch handled and the current optimize attribute really behaves that 
> > same as appending attribute value
> > to the command line. So far so good. We should also reflect that in 
> > documentation entry which is quite
> > vague right now:
> >
> > """
> > The optimize attribute is used to specify that a function is to be compiled 
> > with different optimization options than specified on the command line.
> > """
> >
> > and we may want to handle -Ox in the attribute in a special way. I guess 
> > many macro/pragma users expect that
> >
> > -O2 -ftree-vectorize and __attribute__((optimize(1))) will end with -O1 and 
> > not
> > with -ftree-vectorize -O1 ?
> 
> Hmm.  I guess the only two reasonable options are to append to the active set
> and thus end up with -ftree-vectorize -O1 or to start from an empty set and 
> thus
> end up with -O1.

I'd say we always want to append, but only take into account explicit
options.
So basically get the effect of
take the command line, append to that options from the optimize/target
pragmas in effect and append to that options from optimize/target
attributes and only from that figure out the implicit options.

Jakub



Re: [PATCH] Optimize macro: make it more predictable

2020-11-03 Thread Richard Biener via Gcc-patches
On Tue, Nov 3, 2020 at 2:35 PM Jakub Jelinek  wrote:
>
> On Tue, Nov 03, 2020 at 02:27:52PM +0100, Richard Biener wrote:
> > On Fri, Oct 23, 2020 at 1:47 PM Martin Liška  wrote:
> > > This is a follow-up of the discussion that happened in thread about 
> > > no_stack_protector
> > > attribute: https://gcc.gnu.org/pipermail/gcc-patches/2020-May/545916.html
> > >
> > > The current optimize attribute works in the following way:
> > > - 1) we take current global_options as base
> > > - 2) maybe_default_options is called for the currently selected 
> > > optimization level, which
> > >   means all rules in default_options_table are executed
> > > - 3) attribute values are applied (via decode_options)
> > >
> > > So the step 2) is problematic: in case of -O2 -fno-omit-frame-pointer and 
> > > __attribute__((optimize("-fno-stack-protector")))
> > > ends basically with -O2 -fno-stack-protector because 
> > > -fno-omit-frame-pointer is default:
> > >  /* -O1 and -Og optimizations.  */
> > >  { OPT_LEVELS_1_PLUS, OPT_fomit_frame_pointer, NULL, 1 },
> > >
> > > My patch handled and the current optimize attribute really behaves that 
> > > same as appending attribute value
> > > to the command line. So far so good. We should also reflect that in 
> > > documentation entry which is quite
> > > vague right now:
> > >
> > > """
> > > The optimize attribute is used to specify that a function is to be 
> > > compiled with different optimization options than specified on the 
> > > command line.
> > > """
> > >
> > > and we may want to handle -Ox in the attribute in a special way. I guess 
> > > many macro/pragma users expect that
> > >
> > > -O2 -ftree-vectorize and __attribute__((optimize(1))) will end with -O1 
> > > and not
> > > with -ftree-vectorize -O1 ?
> >
> > Hmm.  I guess the only two reasonable options are to append to the active 
> > set
> > and thus end up with -ftree-vectorize -O1 or to start from an empty set and 
> > thus
> > end up with -O1.
>
> I'd say we always want to append, but only take into account explicit
> options.
> So basically get the effect of
> take the command line, append to that options from the optimize/target
> pragmas in effect and append to that options from optimize/target
> attributes and only from that figure out the implicit options.

OK, so minus target options that is what martins patch does, right?

Richard.

> Jakub
>


Re: libcpp: dependency emission tidying

2020-11-03 Thread Tobias Burnus

Hi Nathan,

I now get:

../../repos/gcc/libcpp/init.c:670:15: error: unused variable ‘deps’ 
[-Werror=unused-variable]
  670 |   if (mkdeps *deps = cpp_get_deps (pfile))
  |   ^~~~

see last three lines of the quoted patch below.

Tobias

On 03.11.20 14:02, Nathan Sidwell wrote:

  cpp_read_main_file (cpp_reader *pfile, const char *fname, bool injecting)
  {
-  if (CPP_OPTION (pfile, deps.style) != DEPS_NONE)
-{
-  if (!pfile->deps)
- pfile->deps = deps_init ();
-
-  /* Set the default target (if there is none already).  */
-  deps_add_default_target (pfile->deps, fname);
-}
+  if (mkdeps *deps = cpp_get_deps (pfile))
+/* Set the default target (if there is none already).  */
+deps_add_default_target (pfile->deps, fname);

-
Mentor Graphics (Deutschland) GmbH, Arnulfstraße 201, 80634 München / Germany
Registergericht München HRB 106955, Geschäftsführer: Thomas Heurung, Alexander 
Walter


Re: [PATCH][PR target/97540] Don't extract memory from operand for normal memory constraint.

2020-11-03 Thread Richard Sandiford via Gcc-patches
Vladimir Makarov via Gcc-patches  writes:
> On 2020-10-27 2:53 a.m., Hongtao Liu wrote:
>> Hi:
>>For inline asm, there could be an operand like (not (mem:)), it's
>> not a valid operand for normal memory constraint.
>>Bootstrap is ok, regression test is ok for make check
>> RUNTESTFLAGS="--target_board='unix{-m32,}'"
>>
>> gcc/ChangeLog
>>  PR target/97540
>>  * ira.c: (ira_setup_alts): Extract memory from operand only
>>  for special memory constraint.
>>  * recog.c (asm_operand_ok): Ditto.
>>  * lra-constraints.c (process_alt_operands): MEM_P is
>>  required for normal memory constraint.
>>
>> gcc/testsuite/ChangeLog
>>  * gcc.target/i386/pr97540.c: New test.
>>
> I understand Richard's concerns and actually these concerns were my 
> motivations to constraint possible cases for extract_mem_from_operand in 
> the original patch introducing the function.
>
> If Richard proposes a better solution we will reconsider the current 
> approach and revert the changes if it is necessary.
>
> Meanwhile I am approving this patch.  I hope it will not demotivate 
> Richard's attempt to find a better solution.

OK, that's fine with me.  I might come back to this next stage 1,
depending on how things turn out.

Richard


Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions emitted at -O3

2020-11-03 Thread Richard Sandiford via Gcc-patches
xiezhiheng  writes:
>> -Original Message-
>> From: Richard Sandiford [mailto:richard.sandif...@arm.com]
>> Sent: Friday, October 30, 2020 6:24 PM
>> To: xiezhiheng 
>> Cc: gcc-patches@gcc.gnu.org
>> Subject: Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions
>> emitted at -O3
>> 
>> xiezhiheng  writes:
>> >> -Original Message-
>> >> From: Richard Sandiford [mailto:richard.sandif...@arm.com]
>> >> Sent: Monday, October 26, 2020 9:03 PM
>> >> To: xiezhiheng 
>> >> Cc: Richard Biener ;
>> gcc-patches@gcc.gnu.org
>> >> Subject: Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions
>> >> emitted at -O3
>> >>
>> >> Thanks, pushed to trunk.
>> >>
>> >
>> > Thanks, and I made the patch for float conversion intrinsics.
>> 
>> LGTM, thanks.  Pushed.
>> 
>
> Thanks.  And I made two separate patches for these two groups, compare 
> intrinsics
> and encryption algorithm (AES/SHA/SM3/SM4) intrinsics.
>
> Note: It does not matter which patch is applied first.
>
> Bootstrapped and tested on aarch64 Linux platform.

Thanks, I pushed both patches to trunk.

Richard


Re: libcpp: dependency emission tidying

2020-11-03 Thread Nathan Sidwell

Whoops, that broke bootstrap.  Pushing this fix.


libcpp/
* init.c (cpp_read_main_file): Use cpp_get_deps result.




--
Nathan Sidwell
diff --git c/libcpp/init.c w/libcpp/init.c
index 5b2607e3767..6c52f50de39 100644
--- c/libcpp/init.c
+++ w/libcpp/init.c
@@ -669,7 +669,7 @@ cpp_read_main_file (cpp_reader *pfile, const char *fname, bool injecting)
 {
   if (mkdeps *deps = cpp_get_deps (pfile))
 /* Set the default target (if there is none already).  */
-deps_add_default_target (pfile->deps, fname);
+deps_add_default_target (deps, fname);
 
   pfile->main_file
 = _cpp_find_file (pfile, fname, &pfile->no_search_path, /*angle=*/0,


Re: [PATCH][AArch64] ACLE intrinsics: get low/high half from BFloat16 vector

2020-11-03 Thread Richard Sandiford via Gcc-patches
Dennis Zhang  writes:
> Hi Richard,
>
> On 10/30/20 2:07 PM, Richard Sandiford wrote:
>> Dennis Zhang  writes:
>>> diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def 
>>> b/gcc/config/aarch64/aarch64-simd-builtins.def
>>> index 332a0b6b1ea..39ebb776d1d 100644
>>> --- a/gcc/config/aarch64/aarch64-simd-builtins.def
>>> +++ b/gcc/config/aarch64/aarch64-simd-builtins.def
>>> @@ -719,6 +719,9 @@
>>> VAR1 (QUADOP_LANE, bfmlalb_lane_q, 0, ALL, v4sf)
>>> VAR1 (QUADOP_LANE, bfmlalt_lane_q, 0, ALL, v4sf)
>>>   
>>> +  /* Implemented by aarch64_vget_halfv8bf.  */
>>> +  VAR1 (GETREG, vget_half, 0, ALL, v8bf)
>> 
>> This should be AUTO_FP, since it doesn't have any side-effects.
>> (As before, we should probably rename the flag, but that's separate work.)
>> 
>>> +
>>> /* Implemented by aarch64_simd_mmlav16qi.  */
>>> VAR1 (TERNOP, simd_smmla, 0, NONE, v16qi)
>>> VAR1 (TERNOPU, simd_ummla, 0, NONE, v16qi)
>>> diff --git a/gcc/config/aarch64/aarch64-simd.md 
>>> b/gcc/config/aarch64/aarch64-simd.md
>>> index 9f0e2bd1e6f..f62c52ca327 100644
>>> --- a/gcc/config/aarch64/aarch64-simd.md
>>> +++ b/gcc/config/aarch64/aarch64-simd.md
>>> @@ -7159,6 +7159,19 @@
>>> [(set_attr "type" "neon_dot")]
>>>   )
>>>   
>>> +;; vget_low/high_bf16
>>> +(define_expand "aarch64_vget_halfv8bf"
>>> +  [(match_operand:V4BF 0 "register_operand")
>>> +   (match_operand:V8BF 1 "register_operand")
>>> +   (match_operand:SI 2 "aarch64_zero_or_1")]
>>> +  "TARGET_BF16_SIMD"
>>> +{
>>> +  int hbase = INTVAL (operands[2]);
>>> +  rtx sel = aarch64_gen_stepped_int_parallel (4, hbase * 4, 1);
>> 
>> I think this needs to be:
>> 
>>aarch64_simd_vect_par_cnst_half
>> 
>> instead.  The issue is that on big-endian targets, GCC assumes vector
>> lane 0 is in the high part of the register, whereas for AArch64 it's
>> always in the low part of the register.  So we convert from AArch64
>> numbering to GCC numbering when generating the rtx and then take
>> endianness into account when matching the rtx later.
>> 
>> It would be good to have -mbig-endian tests that make sure we generate
>> the right instruction for each function (i.e. we get them the right way
>> round).  I guess it would be good to test that for little-endian too.
>> 
>
> I've updated the expander using aarch64_simd_vect_par_cnst_half.
> And the expander is divided into two for getting low and high half 
> seperately.
> It's tested for aarch64-none-linux-gnu and aarch64_be-none-linux-gnu 
> targets with new tests including -mbig-endian option.
>
>>> +  emit_insn (gen_aarch64_get_halfv8bf (operands[0], operands[1], sel));
>>> +  DONE;
>>> +})
>>> +
>>>   ;; bfmmla
>>>   (define_insn "aarch64_bfmmlaqv4sf"
>>> [(set (match_operand:V4SF 0 "register_operand" "=w")
>>> diff --git a/gcc/config/aarch64/predicates.md 
>>> b/gcc/config/aarch64/predicates.md
>>> index 215fcec5955..0c8bc2b0c73 100644
>>> --- a/gcc/config/aarch64/predicates.md
>>> +++ b/gcc/config/aarch64/predicates.md
>>> @@ -84,6 +84,10 @@
>>>  (ior (match_test "op == constm1_rtx")
>>>   (match_test "op == const1_rtx"))
>>>   
>>> +(define_predicate "aarch64_zero_or_1"
>>> +  (and (match_code "const_int")
>>> +   (match_test "op == const0_rtx || op == const1_rtx")))
>> 
>> zero_or_1 looked odd to me, feels like it should be 0_or_1 or zero_or_one.
>> But I see that it's for consistency with aarch64_reg_zero_or_m1_or_1,
>> so let's keep it as-is.
>> 
>
> This predicate is removed since there is no need of the imm operand in 
> the new expanders.
>
> Thanks for the reviews.
> Is it OK for trunk now?

Looks good.  OK for trunk and branches, thanks.

Richard


[PATCH] middle-end/97579 - fix VEC_COND_EXPR ISEL optab query

2020-11-03 Thread Richard Biener
This fixes a mistake in the optab query done by ISEL.  It
doesn't fix the PR but shifts the ICE elsewhere.

Bootstrapped on x86_64-unknown-linux-gnu, testing in progress.

Richard.

2020-11-03  Richard Biener  

PR middle-end/97579
* gimple-isel.cc (gimple_expand_vec_cond_expr): Use
the correct types for the vcond_mask/vec_cmp optab queries.
---
 gcc/gimple-isel.cc | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/gcc/gimple-isel.cc b/gcc/gimple-isel.cc
index b64e31fc6fe..9186ff55cdd 100644
--- a/gcc/gimple-isel.cc
+++ b/gcc/gimple-isel.cc
@@ -162,11 +162,12 @@ gimple_expand_vec_cond_expr (gimple_stmt_iterator *gsi,
  op0a = gimple_assign_rhs1 (def_stmt);
  op0b = gimple_assign_rhs2 (def_stmt);
 
+ tree op0_type = TREE_TYPE (op0);
  tree op0a_type = TREE_TYPE (op0a);
  if (used_vec_cond_exprs >= 2
- && (get_vcond_mask_icode (mode, TYPE_MODE (op0a_type))
+ && (get_vcond_mask_icode (mode, TYPE_MODE (op0_type))
  != CODE_FOR_nothing)
- && expand_vec_cmp_expr_p (op0a_type, TREE_TYPE (lhs), tcode))
+ && expand_vec_cmp_expr_p (op0a_type, op0_type, tcode))
{
  /* Keep the SSA name and use vcond_mask.  */
  tcode = TREE_CODE (op0);
-- 
2.26.2


[PATCH] tree-optimization/97623 - limit PRE hoist insertion

2020-11-03 Thread Richard Biener
This limits insert iteration caused by PRE insertions generating
hoist insertion opportunities and vice versa.  The patch limits
the hoist insertion iterations to three by default.

Bootstrapped on x86_64-unknown-linux-gnu, testing in progress.

Richard.

2020-11-03  Richard Biener  

PR tree-optimization/97623
* params.opt (-param=max-pre-hoist-insert-iterations): New.
* doc/invoke.texi (max-pre-hoist-insert-iterations): Document.
* tree-ssa-pre.c (insert): Do at most max-pre-hoist-insert-iterations
hoist insert iterations.
---
 gcc/doc/invoke.texi | 5 +
 gcc/params.opt  | 4 
 gcc/tree-ssa-pre.c  | 7 +--
 3 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 89168be1d2f..5320e6c1e1e 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -13408,6 +13408,11 @@ is aborted and the load or store is not considered 
redundant.  The
 number of queries is algorithmically limited to the number of
 stores on all paths from the load to the function entry.
 
+@item max-pre-hoist-insert-iterations
+The maximum number of iterations doing insertion during code
+hoisting which is done as part of the partial redundancy elimination
+insertion phase.
+
 @item ira-max-loops-num
 IRA uses regional register allocation by default.  If a function
 contains more loops than the number given by this parameter, only at most
diff --git a/gcc/params.opt b/gcc/params.opt
index 7bac39a9d58..a33a371a395 100644
--- a/gcc/params.opt
+++ b/gcc/params.opt
@@ -597,6 +597,10 @@ Maximum depth of sqrt chains to use when synthesizing 
exponentiation by a real c
 Common Joined UInteger Var(param_max_predicted_iterations) Init(100) 
IntegerRange(0, 65536) Param Optimization
 The maximum number of loop iterations we predict statically.
 
+-param=max-pre-hoist-insert-iterations=
+Common Joined UInteger Var(param_max_pre_hoist_insert_iterations) Init(3) 
Param Optimization
+The maximum number of insert iterations done for PRE code hoisting.
+
 -param=max-reload-search-insns=
 Common Joined UInteger Var(param_max_reload_search_insns) Init(100) Param 
Optimization
 The maximum number of instructions to search backward when looking for 
equivalent reload.
diff --git a/gcc/tree-ssa-pre.c b/gcc/tree-ssa-pre.c
index 091ecb39bb6..39c52c9b0f0 100644
--- a/gcc/tree-ssa-pre.c
+++ b/gcc/tree-ssa-pre.c
@@ -3647,8 +3647,11 @@ insert (void)
 
   changed = false;
   /* Insert expressions for hoisting.  Do a backward walk here since
-inserting into BLOCK exposes new opportunities in its predecessors.  */
-  if (flag_code_hoisting)
+inserting into BLOCK exposes new opportunities in its predecessors.
+Since PRE and hoist insertions can cause back-to-back iteration
+limit that on the hoist side.  */
+  if (flag_code_hoisting
+ && num_iterations <= param_max_pre_hoist_insert_iterations)
for (int idx = rpo_num - 1; idx >= 0; --idx)
  {
basic_block block = BASIC_BLOCK_FOR_FN (cfun, rpo[idx]);
-- 
2.26.2


Re: PowerPC: Add __float128 conversions to/from Decimal

2020-11-03 Thread Segher Boessenkool
On Tue, Nov 03, 2020 at 01:12:29AM +, Joseph Myers wrote:
> On Mon, 2 Nov 2020, Segher Boessenkool wrote:
> 
> > > Also note that if you want to use printf as opposed to strfromf128 for 
> > > IEEE binary128 you'll need to use __printfieee128 (the version that 
> > > expects long double to be IEEE binary128) which was introduced in glibc 
> > > 2.32, so that doesn't help with the glibc version dependencies.
> > 
> > libiberty has printf functions of its own, I was wondering if those work
> > fine; if they do, that would solve all problems here.
> 
> I don't see any meaningful kind of printf implementation in libiberty.  
> There are implementations of various printf functions in terms of other 
> printf functions (including vprintf in terms of vfprintf in terms of 
> _doprnt in terms of fprintf), but nothing that actually does the main work 
> of converting a floating-point value to a string without calling out to 
> some libc printf function.

I missed that "in terms of fprintf" step :-(

Well, rats.  Thanks for telling me about my stupidity!  :-)


Segher


[committed] Cleanup of a merge mistake in fold-const.c

2020-11-03 Thread Bernd Edlinger
Hi,


this removes a duplicated statement, in fold-const.c in function getbyterep:

The comment, "Ideally this would turn into a gcc_checking_assert over time."
and the following if-statement are duplicated so one of them can be removed:

  if (init_bytes > array_size)
init_bytes = array_size;


This happened due to a merge conflict a long time ago.

Bootstrapped and regtested on x86_64-pc-linux-gnu.

I think this qualifies as obvious, so I will commit it now.


Thanks
Bernd.
From 48a85b06992e4d915f29998f8db96ec2a019ea16 Mon Sep 17 00:00:00 2001
From: Bernd Edlinger 
Date: Tue, 3 Nov 2020 14:20:14 +0100
Subject: [PATCH] Cleanup of a merge mistake in fold-const.c

This removes a duplicated statement.
It was apparently introduced due to a merge mistake.

2020-11-03  Bernd Edlinger  

	* fold-const.c (getbyterep): Remove duplicated statement.
---
 gcc/fold-const.c | 5 -
 1 file changed, 5 deletions(-)

diff --git a/gcc/fold-const.c b/gcc/fold-const.c
index ebd32bb..c47557d 100644
--- a/gcc/fold-const.c
+++ b/gcc/fold-const.c
@@ -15565,11 +15565,6 @@ getbyterep (tree src, unsigned HOST_WIDE_INT *strsize)
  is equal to strlen (A) + 1.  */
   const unsigned HOST_WIDE_INT array_size = tree_to_uhwi (mem_size);
   unsigned HOST_WIDE_INT init_bytes = TREE_STRING_LENGTH (src);
-
-  /* Ideally this would turn into a gcc_checking_assert over time.  */
-  if (init_bytes > array_size)
-init_bytes = array_size;
-
   const char *string = TREE_STRING_POINTER (src);
 
   /* Ideally this would turn into a gcc_checking_assert over time.  */
-- 
1.9.1



RE: [PATCH] SLP: Move load/store-lanes check till late

2020-11-03 Thread Tamar Christina via Gcc-patches
Hi Richi,

We decided to take the regression in any code-gen this could
give and fix it properly next stage-1.  As such here's a new
patch based on your previous feedback.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

* tree-vect-slp.c (vect_analyze_slp_instance): Moved load/store lanes
check to ...
* tree-vect-loop.c (vect_analyze_loop_2): ..Here

gcc/testsuite/ChangeLog:

* gcc.dg/vect/slp-11b.c: Update output scan.
* gcc.dg/vect/slp-perm-6.c: Likewise.

> -Original Message-
> From: rguent...@c653.arch.suse.de  On
> Behalf Of Richard Biener
> Sent: Thursday, October 22, 2020 9:44 AM
> To: Tamar Christina 
> Cc: gcc-patches@gcc.gnu.org; nd ; o...@ucw.cz
> Subject: Re: [PATCH] SLP: Move load/store-lanes check till late
> 
> On Wed, 21 Oct 2020, Tamar Christina wrote:
> 
> > Hi All,
> >
> > This moves the code that checks for load/store lanes further in the
> > pipeline and places it after slp_optimize.  This would allow us to
> > perform optimizations on the SLP tree and only bail out if we really have a
> permute.
> >
> > With this change it allows us to handle permutes such as {1,1,1,1}
> > which should be handled by a load and replicate.
> >
> > This change however makes it all or nothing. Either all instances can
> > be handled or none at all.  This is why some of the test cases have been
> adjusted.
> 
> So this possibly leaves a loop unvectorized in case there's a ldN/stN
> opportunity but another SLP instance with a permutation not handled by
> interleaving is present.  What I was originally suggesting is to only cancel 
> the
> SLP build if _all_ instances can be handled with ldN/stN.
> 
> Of course I'm also happy with completely removing this heuristics.
> 
> Note some of the comments look off now, also the assignment to ok before
> the goto is pointless and you should probably turn this into a dump print
> instead.
> 
> Thanks,
> Richard.
> 
> > Bootstrapped Regtested on aarch64-none-linux-gnu, -x86_64-pc-linux-gnu
> > and no issues.
> >
> > Ok for master?
> 
> 
> 
> > Thanks,
> > Tamar
> >
> > gcc/ChangeLog:
> >
> > * tree-vect-slp.c (vect_analyze_slp_instance): Moved load/store
> lanes
> > check to ...
> > * tree-vect-loop.c (vect_analyze_loop_2): ..Here
> >
> > gcc/testsuite/ChangeLog:
> >
> > * gcc.dg/vect/slp-11b.c: Update output scan.
> > * gcc.dg/vect/slp-perm-6.c: Likewise.
> >
> >
> 
> --
> Richard Biener 
> SUSE Software Solutions Germany GmbH, Maxfeldstrasse 5, 90409
> Nuernberg, Germany; GF: Felix Imend
diff --git a/gcc/testsuite/gcc.dg/vect/slp-11b.c b/gcc/testsuite/gcc.dg/vect/slp-11b.c
index 0cc23770badf0e00ef98769a2dd14a92dca32cca..fe5bb0c3ce7682c7cef1313e342d95aba3fe11b2 100644
--- a/gcc/testsuite/gcc.dg/vect/slp-11b.c
+++ b/gcc/testsuite/gcc.dg/vect/slp-11b.c
@@ -45,4 +45,4 @@ int main (void)
 
 /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target { vect_strided4 && vect_int_mult } } } } */
 /* { dg-final { scan-tree-dump-times "vectorized 0 loops" 1 "vect" { target { ! { vect_strided4 && vect_int_mult } } } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" } } */
+/* { dg-final { scan-tree-dump-times "re-trying with SLP disabled" 1 "vect" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/slp-perm-6.c b/gcc/testsuite/gcc.dg/vect/slp-perm-6.c
index 38489291a2659c989121d44c9e9e7bdfaa12f868..07bf8916de7ce88bbb1d65437f8bf6d8ab17efe6 100644
--- a/gcc/testsuite/gcc.dg/vect/slp-perm-6.c
+++ b/gcc/testsuite/gcc.dg/vect/slp-perm-6.c
@@ -106,7 +106,7 @@ int main (int argc, const char* argv[])
 /* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { target { vect_perm3_int && { {! vect_load_lanes } && {! vect_partial_vectors_usage_1 } } } } } } */
 /* The epilogues are vectorized using partial vectors.  */
 /* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 4 "vect" { target { vect_perm3_int && { {! vect_load_lanes } && vect_partial_vectors_usage_1 } } } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target vect_load_lanes } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" { target vect_load_lanes } } } */
 /* { dg-final { scan-tree-dump "Built SLP cancelled: can use load/store-lanes" "vect" { target { vect_perm3_int && vect_load_lanes } } } } */
 /* { dg-final { scan-tree-dump "LOAD_LANES" "vect" { target vect_load_lanes } } } */
 /* { dg-final { scan-tree-dump "STORE_LANES" "vect" { target vect_load_lanes } } } */
diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index 6fa185daa2836062814f9c9a6659011a3153c6a2..56873b93ef9905ff76929f471de4d32559268304 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -2365,6 +2365,78 @@ start_over:
    "unsupported SLP instances\n");
 	  goto again;
 	}
+
+  /* Check whether any load in ALL SLP instances is possibly permuted.  */
+  slp_tree load_node, slp_root;
+  unsigned i, x;
+   

RE: [PATCH v2 6/16]middle-end Add Complex Addition with rotation detection

2020-11-03 Thread Tamar Christina via Gcc-patches
Hi All,

here is a respin with the requested changes.

I just realized I haven't updated the documentation yet but will do
so soon since I'm sure there will be feedback :)

Thanks,
Tamar

gcc/ChangeLog:

* doc/md.texi: Document optabs.
* internal-fn.def (COMPLEX_ADD_ROT90, COMPLEX_ADD_ROT270): New.
* optabs.def (cadd90_optab, cadd270_optab): New.
* tree-vect-slp-patterns.c (linear_loads_p, vect_slp_make_linear,
class complex_add_pattern,complex_add_pattern::matches): New.
(complex_operations_pattern::matches): Add complex_add_pattern.

> -Original Message-
> From: rguent...@c653.arch.suse.de  On
> Behalf Of Richard Biener
> Sent: Tuesday, September 29, 2020 11:44 AM
> To: Richard Sandiford 
> Cc: Tamar Christina ; gcc-patches@gcc.gnu.org;
> nd ; o...@ucw.cz
> Subject: Re: [PATCH v2 6/16]middle-end Add Complex Addition with rotation
> detection
> 
> On Tue, 29 Sep 2020, Richard Sandiford wrote:
> 
> > Tamar Christina  writes:
> > > diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi index
> > >
> 2b46286943778e16d95b15def4299bcbf8db7eb8..71e226505b2619d10982b59a
> 4e
> > > bbed73a70f29be 100644
> > > --- a/gcc/doc/md.texi
> > > +++ b/gcc/doc/md.texi
> > > @@ -6132,6 +6132,17 @@ floating-point mode.
> > >
> > >  This pattern is not allowed to @code{FAIL}.
> > >
> > > +@cindex @code{cadd@var{m}@var{n}3} instruction pattern @item
> > > +@samp{cadd@var{m}@var{n}3} Perform a vector addition of complex
> > > +numbers in operand 1 with operand 2 rotated by @var{m} degrees
> > > +around the argand plane and storing the result in operand 0.  The
> > > +instruction must perform the operation on data loaded contiguously
> > > +into the vectors.
> >
> > Nitpicking, sorry, but I think it would be better to describe the
> > layout directly rather than in terms of loads, since the preceding
> > operation might not be a load.
> 
> So if we're at that and since GCC vectors do not have complex components
> can we formulate this in terms avoiding 'complex'?
> Isn't this an add of one vector to a vector with adjacent lanes swapped and
> possibly negated?  Mentioning that this would match a complex add in case
> lanes happen to match up with complex real/imag parts is OK but the pattern
> should work equally well if there's no complex numbers involved?
> 
> > I guess the main question is: what representation do we expect for
> > big-endian?  A normal Advanced SIMD LDR would give this (for floats):
> >
> >  MEMORY
> >+-+-+-+-+
> >| r0  | i0  | r1  | i1  |
> >+-+-+-+-+
> >|  0  |  1  |  2  |  3  |   array numbering
> >+-+-+-+-+
> >   V V V V  Advanced SIMD LDR
> >+-+-+-+-+
> >| r0  | i0  | r1  | i1  |
> >+-+-+-+-+
> >|  0  |  1  |  2  |  3  |   GCC lane numbering
> >+-+-+-+-+
> >|  3  |  2  |  1  |  0  |   Arm lane numbering
> >+-+-+-+-+
> >   MSB   REGISTER  LSB
> >
> > but the FC* instructions put the imaginary parts in the more
> > significant lane, so the pairs of elements above would need to be
> > reversed:
> >
> >  MEMORY
> >+-+-+-+-+
> >| r0  | i0  | r1  | i1  |
> >+-+-+-+-+
> >|  0  |  1  |  2  |  3  |   array numbering
> >+-+-+-+-+
> >\   /   \   /
> > \ / \ /
> >  X   X Load and permute
> > / \ / \
> >/   \   /   \
> >+-+-+-+-+
> >| i0  | r0  | i1  | r1  |
> >+-+-+-+-+
> >|  0  |  1  |  2  |  3  |   GCC lane numbering
> >+-+-+-+-+
> >|  3  |  2  |  1  |  0  |   Arm lane numbering
> >+-+-+-+-+
> >   MSB   REGISTER  LSB
> >
> > (Or the whole vector could be reversed.)
> >
> > We might decide that it just isn't worth doing this for Advanced SIMD.
> > But should the semantics of the optab be that:
> >
> > (1) GCC lane number 0 holds a real part, or
> > (2) the least significant lane holds a real part?
> >
> > With (1), it would be up to the target to hide the permute above.
> > With (2), the vectoriser would need to introduce the permute itself.
> >
> > I'm not sure there's a perfect answer even for Arm targets.  (2)
> > matches the Advanced SIMD semantics.  But for SVE, the register layout
> > follows
> > LD1 rather than LDR, and the GCC and architectural lane numbering match
> up.
> > (1) would therefore be better than (2) for SVE (and so no permute
> > would be needed for either endianness on SVE).
> >
> > > +The operation is only supported for vector modes @var{n} and with
> > > +rotations @var{m} of 90 or 270.
> > > +
> > > +This pattern is not allowed to @code{FAIL}.
> > > +
> > >  @cindex @code{ffs@var{m}2} instruction pattern  @item
> > > @samp{ffs@var{m}2}  Store into operand 0 one plus the index of the
> > > lea

RE: [PATCH v2 4/16]middle-end: Add dissolve code for when SLP fails and non-SLP loop vectorization is to be tried.

2020-11-03 Thread Tamar Christina via Gcc-patches
Hi All,

Here's a respin of this patch with the requested changes.

Thanks,
Tamar

gcc/ChangeLog:

* tree-vect-loop.c (vect_dissolve_slp_only_patterns): New
(vect_dissolve_slp_only_groups): Call vect_dissolve_slp_only_patterns.

> -Original Message-
> From: Gcc-patches  On Behalf Of Tamar
> Christina
> Sent: Friday, September 25, 2020 3:28 PM
> To: gcc-patches@gcc.gnu.org
> Cc: nd ; rguent...@suse.de; o...@ucw.cz
> Subject: [PATCH v2 4/16]middle-end: Add dissolve code for when SLP fails
> and non-SLP loop vectorization is to be tried.
> 
> Hi All,
> 
> This adds the dissolve code to undo the patterns created by the pattern
> matcher in case SLP is to be aborted.
> 
> As mentioned in the cover letter this has one issue in that the number of
> copies can needed can change depending on whether TWO_OPERATORS is
> needed or not.
> 
> Because of this I don't analyze the original statement when it's replaced by a
> pattern and attempt to correct it here by analyzing it after dissolve.
> 
> This however seems too late and I would need to change the unroll factor,
> which seems a bit odd.  Any advice would be appreciated.
> 
> Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
> 
> Thanks,
> Tamar
> 
> gcc/ChangeLog:
> 
>   * tree-vect-loop.c (vect_dissolve_slp_only_patterns): New
>   (vect_dissolve_slp_only_groups): Call
> vect_dissolve_slp_only_patterns.
> 
> --
diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index 6fa185daa2836062814f9c9a6659011a3153c6a2..9601a83edcb05e994e27d4bb16a537190ad8471d 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -1979,6 +1979,63 @@ vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
   return opt_result::success ();
 }
 
+/* For every SLP only pattern created by the pattern matched rooted in ROOT
+   restore the relevancy of the original statements over those of the pattern
+   and destroy the pattern relationship.  This restores the SLP tree to a state
+   where it can be used when SLP build is cancelled or re-tried.  */
+
+static void
+vect_dissolve_slp_only_patterns (loop_vec_info loop_vinfo,
+ hash_set *visited, slp_tree root)
+{
+  if (!root || visited->contains (root))
+return;
+
+  unsigned int i;
+  slp_tree node;
+  stmt_vec_info related_stmt_info;
+  stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (root);
+
+  visited->add (root);
+
+if (stmt_info && STMT_VINFO_SLP_VECT_ONLY (stmt_info)
+	 && (related_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info)) != NULL)
+  {
+	if (dump_enabled_p ())
+	  dump_printf_loc (MSG_NOTE, vect_location,
+			   "dissolving relevancy of %G",
+			   STMT_VINFO_STMT (stmt_info));
+	STMT_VINFO_RELEVANT (stmt_info) = vect_unused_in_scope;
+	STMT_VINFO_RELEVANT (related_stmt_info) = vect_used_in_scope;
+	STMT_VINFO_IN_PATTERN_P (related_stmt_info) = false;
+	STMT_SLP_TYPE (related_stmt_info) = loop_vect;
+  }
+
+  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i, node)
+vect_dissolve_slp_only_patterns (loop_vinfo, visited, node);
+}
+
+/* Lookup any SLP Only Pattern statements created by the SLP pattern matcher in
+   all slp_instances in LOOP_VINFO and undo the relevancy of statements such
+   that the original SLP tree before the pattern matching is used.  */
+
+static void
+vect_dissolve_slp_only_patterns (loop_vec_info loop_vinfo)
+{
+
+  unsigned int i;
+  hash_set visited;
+
+  DUMP_VECT_SCOPE ("vect_dissolve_slp_only_patterns");
+
+  /* Unmark any SLP only patterns as relevant and restore the STMT_INFO of the
+ related instruction.  */
+  slp_instance instance;
+  FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
+vect_dissolve_slp_only_patterns (loop_vinfo, &visited,
+ SLP_INSTANCE_TREE (instance));
+}
+
 /* Look for SLP-only access groups and turn each individual access into its own
group.  */
 static void
@@ -2510,6 +2567,9 @@ again:
   /* Ensure that "ok" is false (with an opt_problem if dumping is enabled).  */
   gcc_assert (!ok);
 
+  /* Dissolve any SLP patterns created by the SLP pattern matcher.  */
+  vect_dissolve_slp_only_patterns (loop_vinfo);
+
   /* Try again with SLP forced off but if we didn't do any SLP there is
  no point in re-trying.  */
   if (!slp)



RE: [PATCH v2 7/16]middle-end: Add Complex Multiplication and Multiplication with Conjucate detection

2020-11-03 Thread Tamar Christina via Gcc-patches
Hi All,

This is a respin of this patch using the new approach.

Thanks,
Tamar

gcc/ChangeLog:

* doc/md.texi: Document optabs.
* internal-fn.def (COMPLEX_MUL, COMPLEX_MUL_CONJ): New.
* optabs.def (cmul_optab, cmul_conj_optab): New,
* tree-vect-slp-patterns.c (vect_build_perm_groups,
(vect_can_combine_node_p, vect_slp_make_combine_linear,
vect_match_call_complex_mla, vect_slp_matches_complex_mul,
class complex_mul_pattern, complex_mul_pattern::matches,
complex_mul_pattern::validate_p,
complex_operations_pattern::matches): Add complex_mul_pattern.


> -Original Message-
> From: Gcc-patches  On Behalf Of Tamar
> Christina
> Sent: Friday, September 25, 2020 3:29 PM
> To: gcc-patches@gcc.gnu.org
> Cc: nd ; rguent...@suse.de; o...@ucw.cz
> Subject: [PATCH v2 7/16]middle-end: Add Complex Multiplication and
> Multiplication with Conjucate detection
> 
> Hi All,
> 
> This patch adds pattern detections for the following operation:
> 
>   Complex multiplication and Conjugate Complex multiplication of the second
>  parameter.
> 
> c = a * b and c = a * conj (b)
> 
>   For the conjugate cases it supports under fast-math that the operands that
> is
>   being conjugated be flipped by flipping the arguments to the optab.  This
>   allows it to support c = conj (a) * b and c += conj (a) * b.
> 
>   where a, b and c are complex numbers.
> 
> and provides a shared class for anything needing to recognize complex MLA
> patterns.
> 
> Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
> 
> Ok for master?
> 
> Thanks,
> Tamar
> 
> gcc/ChangeLog:
> 
>   * doc/md.texi: Document optabs.
>   * internal-fn.def (COMPLEX_MUL, COMPLEX_MUL_CONJ): New.
>   * optabs.def (cmul_optab, cmul_conj_optab): New,
>   * tree-vect-slp-patterns.c (class ComplexMLAPattern,
>   class ComplexMulPattern): New.
>   (slp_patterns): Add ComplexMulPattern.
> 
> --
diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi
index 71e226505b2619d10982b59a4ebbed73a70f29be..ddaf1abaccbd44dae11ea902ec38b474aacfb8e1 100644
--- a/gcc/doc/md.texi
+++ b/gcc/doc/md.texi
@@ -6143,6 +6143,28 @@ rotations @var{m} of 90 or 270.
 
 This pattern is not allowed to @code{FAIL}.
 
+@cindex @code{cmul@var{m}4} instruction pattern
+@item @samp{cmul@var{m}4}
+Perform a vector floating point multiplication of complex numbers in operand 0
+and operand 1.
+
+The instruction must perform the operation on data loaded contiguously into the
+vectors.
+The operation is only supported for vector modes @var{m}.
+
+This pattern is not allowed to @code{FAIL}.
+
+@cindex @code{cmul_conj@var{m}4} instruction pattern
+@item @samp{cmul_conj@var{m}4}
+Perform a vector floating point multiplication of complex numbers in operand 0
+and the conjugate of operand 1.
+
+The instruction must perform the operation on data loaded contiguously into the
+vectors.
+The operation is only supported for vector modes @var{m}.
+
+This pattern is not allowed to @code{FAIL}.
+
 @cindex @code{ffs@var{m}2} instruction pattern
 @item @samp{ffs@var{m}2}
 Store into operand 0 one plus the index of the least significant 1-bit
diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
index 33c54be1e158ddea25c4cd6b1148df8cf4a509b5..cb41643f5e332518a0271bb8e1af4883c8bd6880 100644
--- a/gcc/internal-fn.def
+++ b/gcc/internal-fn.def
@@ -279,6 +279,8 @@ DEF_INTERNAL_FLT_FLOATN_FN (FMAX, ECF_CONST, fmax, binary)
 DEF_INTERNAL_OPTAB_FN (XORSIGN, ECF_CONST, xorsign, binary)
 DEF_INTERNAL_OPTAB_FN (COMPLEX_ADD_ROT90, ECF_CONST, cadd90, binary)
 DEF_INTERNAL_OPTAB_FN (COMPLEX_ADD_ROT270, ECF_CONST, cadd270, binary)
+DEF_INTERNAL_OPTAB_FN (COMPLEX_MUL, ECF_CONST, cmul, binary)
+DEF_INTERNAL_OPTAB_FN (COMPLEX_MUL_CONJ, ECF_CONST, cmul_conj, binary)
 
 
 /* FP scales.  */
diff --git a/gcc/optabs.def b/gcc/optabs.def
index 2bb0bf857977035bf562a77f5f6848e80edf936d..9c267d422478d0011f288b1f5f62daabe3989ba7 100644
--- a/gcc/optabs.def
+++ b/gcc/optabs.def
@@ -292,6 +292,8 @@ OPTAB_D (copysign_optab, "copysign$F$a3")
 OPTAB_D (xorsign_optab, "xorsign$F$a3")
 OPTAB_D (cadd90_optab, "cadd90$a3")
 OPTAB_D (cadd270_optab, "cadd270$a3")
+OPTAB_D (cmul_optab, "cmul$a3")
+OPTAB_D (cmul_conj_optab, "cmul_conj$a3")
 OPTAB_D (cos_optab, "cos$a2")
 OPTAB_D (cosh_optab, "cosh$a2")
 OPTAB_D (exp10_optab, "exp10$a2")
diff --git a/gcc/tree-vect-slp-patterns.c b/gcc/tree-vect-slp-patterns.c
index 0732cf0a6d93be8590b84c39dff82940b280e46b..2edb0117f9cbbfc40e9ed3a96120a3c88f84a68e 100644
--- a/gcc/tree-vect-slp-patterns.c
+++ b/gcc/tree-vect-slp-patterns.c
@@ -196,6 +196,65 @@ linear_loads_p (slp_tree root, bool *linear)
   return loads;
 }
 
+/* Builds a permutation group from the operands in OPS and stores it in BLOCKS.
+   The group describes how to combine the operators to get a valid linear node.
+
+   This is used when combining multiple children from a two_operators node into
+   one using a lane permute to select the appropriate lane. As an e

RE: [PATCH v2 2/16]middle-end: Refactor and expose some vectorizer helper functions.

2020-11-03 Thread Tamar Christina via Gcc-patches
Hi All,

This patch is a respin of the previous one defining a new helper
function add_pattern_stmt.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

* tree-vect-patterns.c (vect_mark_pattern_stmts): Remove static inline.
* tree-vect-slp.c (vect_create_new_slp_node): Remove static and only
set smts if valid.
* tree-vectorizer.c (vec_info::add_pattern_stmt): New.
(vec_info::set_vinfo_for_stmt): Optionally enforce read-only.
* tree-vectorizer.h (struct _slp_tree): Use new types.
(lane_permutation_t, lane_permutation_t): New.
(vect_create_new_slp_node, vect_mark_pattern_stmts): New.

> -Original Message-
> From: Gcc-patches  On Behalf Of Tamar
> Christina
> Sent: Friday, September 25, 2020 3:28 PM
> To: gcc-patches@gcc.gnu.org
> Cc: nd ; rguent...@suse.de; o...@ucw.cz
> Subject: [PATCH v2 2/16]middle-end: Refactor and expose some vectorizer
> helper functions.
> 
> Hi All,
> 
> This is a small refactoring which exposes some helper functions in the
> vectorizer so they can be used in other places.
> 
> Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
> 
> Ok for master?
> 
> Thanks,
> Tamar
> 
> gcc/ChangeLog:
> 
>   * tree-vect-patterns.c (vect_mark_pattern_stmts): Remove static.
>   * tree-vect-slp.c (vect_free_slp_tree,
>   vect_build_slp_tree): Remove static.
>   (struct bst_traits, bst_traits::hash, bst_traits::equal): Move...
>   * tree-vectorizer.h (struct bst_traits, bst_traits::hash,
>   bst_traits::equal): ... to here.
>   (vect_mark_pattern_stmts, vect_free_slp_tree,
>   vect_build_slp_tree): Declare.
> 
> --
diff --git a/gcc/tree-vect-patterns.c b/gcc/tree-vect-patterns.c
index ac56acebe016058cbbc9599cef348ec4211c19d6..32b272ac443cac6bbaf2695c81078a9c8c2a656d 100644
--- a/gcc/tree-vect-patterns.c
+++ b/gcc/tree-vect-patterns.c
@@ -5216,7 +5216,7 @@ const unsigned int NUM_PATTERNS = ARRAY_SIZE (vect_vect_recog_func_ptrs);
 
 /* Mark statements that are involved in a pattern.  */
 
-static inline void
+void
 vect_mark_pattern_stmts (vec_info *vinfo,
 			 stmt_vec_info orig_stmt_info, gimple *pattern_stmt,
  tree pattern_vectype)
diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c
index e97fbe897a76008d50ee94c3b1b009344cc37d4a..30036ec84c74a0e428cc661eacf565224047f9e0 100644
--- a/gcc/tree-vect-slp.c
+++ b/gcc/tree-vect-slp.c
@@ -148,15 +148,18 @@ vect_free_slp_instance (slp_instance instance)
 
 /* Create an SLP node for SCALAR_STMTS.  */
 
-static slp_tree
+slp_tree
 vect_create_new_slp_node (slp_tree node,
 			  vec scalar_stmts, unsigned nops)
 {
   SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
   SLP_TREE_CHILDREN (node).create (nops);
   SLP_TREE_DEF_TYPE (node) = vect_internal_def;
-  SLP_TREE_REPRESENTATIVE (node) = scalar_stmts[0];
-  SLP_TREE_LANES (node) = scalar_stmts.length ();
+  if (scalar_stmts.exists ())
+{
+  SLP_TREE_REPRESENTATIVE (node) = scalar_stmts[0];
+  SLP_TREE_LANES (node) = scalar_stmts.length ();
+}
   return node;
 }
 
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index fbf5291cf065f3944040937db92d3997acd45f23..4bd454cfb185d7036843fc7140b073f525b2ec6a 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -115,6 +115,8 @@ typedef hash_map > lane_permutation_t;
+typedef vec load_permutation_t;
 
 extern object_allocator<_slp_tree> *slp_tree_pool;
 
@@ -137,11 +139,11 @@ struct _slp_tree {
 
   /* Load permutation relative to the stores, NULL if there is no
  permutation.  */
-  vec load_permutation;
+  load_permutation_t load_permutation;
   /* Lane permutation of the operands scalar lanes encoded as pairs
  of { operand number, lane number }.  The number of elements
  denotes the number of output lanes.  */
-  vec > lane_permutation;
+  lane_permutation_t lane_permutation;
 
   tree vectype;
   /* Vectorized stmt/s.  */
@@ -348,6 +350,7 @@ public:
   ~vec_info ();
 
   stmt_vec_info add_stmt (gimple *);
+  stmt_vec_info add_pattern_stmt (gimple *, stmt_vec_info);
   stmt_vec_info lookup_stmt (gimple *);
   stmt_vec_info lookup_def (tree);
   stmt_vec_info lookup_single_use (tree);
@@ -393,7 +396,7 @@ public:
 
 private:
   stmt_vec_info new_stmt_vec_info (gimple *stmt);
-  void set_vinfo_for_stmt (gimple *, stmt_vec_info);
+  void set_vinfo_for_stmt (gimple *, stmt_vec_info, bool = true);
   void free_stmt_vec_infos ();
   void free_stmt_vec_info (stmt_vec_info);
 };
@@ -1975,8 +1978,12 @@ extern void duplicate_and_interleave (vec_info *, gimple_seq *, tree,
   vec, unsigned int, vec &);
 extern int vect_get_place_in_interleaving_chain (stmt_vec_info, stmt_vec_info);
 extern bool vect_update_shared_vectype (stmt_vec_info, tree);
+extern slp_tree vect_create_new_slp_node (vec, unsigned);
 
 /* In tree-vect-patterns.c.  */
+extern void
+vect_mark_pattern_stmts (vec_info *, stmt_vec_info, gimple *, tree);
+
 /* Pattern recognition functions.
Additional pattern recogniti

RE: [PATCH v2 5/16]middle-end: Add shared machinery for matching patterns involving complex numbers.

2020-11-03 Thread Tamar Christina via Gcc-patches
Hi All,

This is a respin containing the requested changes.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

* tree-vect-slp-patterns.c (vect_match_expression_p,
vect_check_lane_permute, vect_detect_pair_op,
vect_mark_stmts_as_in_pattern, class complex_pattern,
complex_pattern::validate_p, class complex_operations_pattern,
complex_operations_pattern::matches,
complex_operations_pattern::get_name,
complex_operations_pattern::build,
complex_operations_pattern::validate_p,
complex_operations_pattern::get_arity,
complex_operations_pattern::is_optab_supported_p,
complex_operations_pattern::get_ifn): New.
(slp_patterns): Add complex_operations_pattern.

> -Original Message-
> From: Gcc-patches  On Behalf Of Tamar
> Christina
> Sent: Monday, September 28, 2020 5:06 PM
> To: Richard Biener 
> Cc: nd ; gcc-patches@gcc.gnu.org; o...@ucw.cz
> Subject: RE: [PATCH v2 5/16]middle-end: Add shared machinery for matching
> patterns involving complex numbers.
> 
> Hi Richi,
> 
> > -Original Message-
> > From: rguent...@c653.arch.suse.de  On
> > Behalf Of Richard Biener
> > Sent: Monday, September 28, 2020 2:22 PM
> > To: Tamar Christina 
> > Cc: gcc-patches@gcc.gnu.org; nd ; o...@ucw.cz
> > Subject: Re: [PATCH v2 5/16]middle-end: Add shared machinery for
> > matching patterns involving complex numbers.
> >
> > On Fri, 25 Sep 2020, Tamar Christina wrote:
> >
> > > Hi All,
> > >
> > > This patch adds shared machinery for detecting patterns having to do
> > > with complex number operations.  The class ComplexPattern provides
> > > helpers for matching and ultimately undoing the permutation in the
> > > tree by rebuilding the graph.
> > >
> > > Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
> > >
> > > Ok for master?
> >
> > I think you want to change all this to not look at individual
> > stmts:
> >
> > +vect_match_expression_p (slp_tree node, tree_code code, int base,
> > + int
> > idx,
> > +stmt_vec_info *op1, stmt_vec_info *op2)
> > +{
> > +
> > +  vec scalar_stmts = SLP_TREE_SCALAR_STMTS (node);
> > +
> >
> > _all_ lanes are supposed to match the operation in
> > SLP_TREE_REPRESENTATIVE there's no need to do any operation-matching
> > on lane granularity.
> >
> 
> That's fair, that flexibility seems like it indeed won't work since the
> statements are vectorized based on SLP_TREE_REPRESENTATIVE anyway. So
> I'll simplify it.
> 
> > The only thing making a difference here is VEC_PERM_EXPR nodes where
> > again - there's no need to look at (eventually non-existant!)
> > SLP_TREE_SCALAR_STMTS but its SLP_TREE_REPRESENTATIVE.
> >
> > +  vec children = SLP_TREE_CHILDREN (node);
> > +
> > +  /* If it's a VEC_PERM_EXPR we need to look one deeper.
> > VEC_PERM_EXPR
> > +only have one entry.  So pick on.  */
> > +  if (node->code == VEC_PERM_EXPR)
> > +   children = SLP_TREE_CHILDREN (children.last ());
> >
> > that's too simplistic ;)
> >
> > + *op1 = SLP_TREE_SCALAR_STMTS (children[0])[n];
> >
> > please make op1,op2 slp_tree, not stmt_vec_info.
> >
> > Where do you look at SLP_TREE_LANE_PERMUTATION at all?  I think the
> > result of
> >
> 
> Here I have to admit that I have/am a bit confused as to the relation
> between the different permute fields.
> LOAD permute is quite straight forward, LANE permute I think are
> shuffles/explicit permutes.
> 
> But then I am lost as to the purpose of a VEC_PERM_EXPR nodes. Is it just a
> marker to indicate that some node below has a load permute somewhere?
> 
> > +vect_detect_pair_op (int base, slp_tree node1, int offset1,
> > + slp_tree
> > node2,
> > +int offset2, vec *ops)
> >
> > could be simply the SLP_TREE_LANE_PERMUTATION? plus its two child
> > nodes?
> 
> Right, if I understood correctly, on the two_operands case the lane permute
> would tell me whether it's + or -, and in the case of the non- two_operands
> cases I probably want to check that it's vNULL since any permute in the order
> changes how the instruction accepts the inputs?
> 
> >
> > In the ComplexAddPattern patch I see
> >
> > +  /* Correct the arguments after matching.  */
> > +  std::swap (this->m_vects[1], this->m_vects[3]);
> >
> > how's that necessary?  The replacement SLP node should end up with
> > just a SLP_TREE_REPRESENTATIVE (the IFN function call).
> > That is, the only thing necessary is verification / matching of the
> > appropriate "interleaving" scheme imposed by
> SLP_TREE_LANE_PERMUTATION.
> 
> But the order or the statements are important as those decide the
> LOAD_PERMUTATION that build_slp_tree creates.
> 
> So in this case the original statement is
> 
>stmt 0 _39 = _37 + _12;
>stmt 1 _6 = _38 - _36;
> 
> {_12, _36} result in a LOAD_PERMUTATION of {1, 0} because of how the
> addition is done.
> So to undo the LOAD_PERMUTE it has to build the new childr

RE: [PATCH v2 3/16]middle-end Add basic SLP pattern matching scaffolding.

2020-11-03 Thread Tamar Christina via Gcc-patches
Hi Richi,

This is a respin which includes the changes you requested.

Thanks,
Tamar

gcc/ChangeLog:

* Makefile.in (tree-vect-slp-patterns.o): New.
* doc/passes.texi: Update documentation.
* tree-vect-slp.c (vect_print_slp_tree): Add new state.
(vect_match_slp_patterns_2, vect_match_slp_patterns): New.
(vect_analyze_slp): Call pattern matcher.
* tree-vectorizer.h (enum _complex_operation):
(class vect_pattern_match, class vect_pattern): New.
* tree-vect-slp-patterns.c: New file.

> -Original Message-
> From: rguent...@c653.arch.suse.de  On
> Behalf Of Richard Biener
> Sent: Tuesday, September 29, 2020 10:42 AM
> To: Richard Sandiford 
> Cc: Tamar Christina ; nd ; gcc-
> patc...@gcc.gnu.org
> Subject: Re: [PATCH v2 3/16]middle-end Add basic SLP pattern matching
> scaffolding.
> 
> On Tue, 29 Sep 2020, Richard Sandiford wrote:
> 
> > Richard Biener  writes:
> > >> > > @@ -2192,6 +2378,17 @@ vect_analyze_slp_instance (vec_info
> *vinfo,
> > >> > >   &tree_size, bst_map);
> > >> > >if (node != NULL)
> > >> > >  {
> > >> > > +  /* Temporarily allow add_stmt calls again.  */
> > >> > > +  vinfo->stmt_vec_info_ro = false;
> > >> > > +
> > >> > > +  /* See if any patterns can be found in the constructed SLP 
> > >> > > tree
> > >> > > +before we do any analysis on it.  */
> > >> > > +  vect_match_slp_patterns (node, vinfo, group_size,
> &max_nunits,
> > >> > > +  matches, &npermutes, &tree_size,
> > >> > > + bst_map);
> > >> > > +
> > >> > > +  /* After this no more add_stmt calls are allowed.  */
> > >> > > +  vinfo->stmt_vec_info_ro = true;
> > >> > > +
> > >> > >
> > >> > > I think this is a bit early to match patterns - I'd defer it to
> > >> > > the point where all entries into the same SLP subgraph are
> > >> > > analyzed, thus somewhere at the end of vect_analyze_slp loop
> > >> > > over all instances and match patterns?  That way phases are more
> clearly separated.
> > >> >
> > >> > That would probably work, my only worry is that the SLP analysis
> > >> > itself may fail and bail out at
> > >> >
> > >> >  /* If the loads and stores can be handled with load/store-lane
> > >> > instructions do not generate this SLP instance.  */
> > >> >  if (is_a  (vinfo)
> > >> >  && loads_permuted
> > >> >  && dr && vect_store_lanes_supported (vectype, group_size,
> > >> > false))
> > >
> > > Ah, that piece of code.  Yeah, I'm repeatedly running into it as
> > > well - it's a bad hack that stands in the way all the time :/
> >
> > At one point I was wondering about trying to drop the above, vectorise
> > with and without SLP, and then compare their costs, like for
> VECT_COMPARE_COSTS.
> > But that seemed like a dead end with the move to doing everything on
> > the SLP representation.
> 
> Yeah ... though even moving everything to the SLP representation will retain
> the issue since there it will be N group-size 1 SLP instances vs. 1 
> group-size N
> SLP instance.
> 
> > > I guess we should try moving this upward like to
> > > vect_analyze_loop_2 right before
> > >
> > >   /* Check the SLP opportunities in the loop, analyze and build SLP trees.
> > > */
> > >   ok = vect_analyze_slp (loop_vinfo, *n_stmts);
> > >   if (!ok)
> > > return ok;
> > >
> > > and there check whether all grouped loads and stores can be handled
> > > with store-/load-lanes (and there are no SLP reduction chains?) in
> > > which case do not try to attempt SLP at all.  Because the testcases
> > > this check was supposed to change were all-load/store-lane or all
> > > SLP so the mixed case is probably not worth special casing.
> > >
> > > Since load-/store-lanes is an arm speciality I tried to only touch
> > > this fragile part with a ten-foot pole ;)  CCing Richard, if he acks
> > > the above I can produce a patch.
> >
> > Yeah, sounds good to me.  Probably also sorth checking whether the
> > likely_max iteration count is high enough to support group_size
> > vectors, if we have enough information to guess that.
> >
> > We could also get the gen* machinery to emit a macro that is true if
> > at least one load/store-lane pattern is defined, so that we can skip
> > the code for non-Arm targets.  I can do that as a follow-up.
> 
> I've had a second look and one complication is that we only elide the SLP
> node if any of the loads are permuted.  So if all loads/stores are unpermuted
> but load/store-lanes would work we'd keep the SLP node.
> 
> Of course without actually building the SLP node we don't know whether the
> loads will be permuted or not ...
> 
> But surely the current place for the check will cause some testcases to
> become hybrid vectorizations which is likely undesirable.
> 
> So we could move the check after all SLP discovery is completed and throw it
> all away if we can and should use load/store-lanes?
> But that

RE: [PATCH v2 8/16]middle-end: add Complex Multiply and Accumulate/Subtract and Multiply and Accumulate/Subtract with Conjucate detection

2020-11-03 Thread Tamar Christina via Gcc-patches
Hi All,

This is a respin of the patch using the new approach.

Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

* doc/md.texi: Document optabs.
* internal-fn.def (COMPLEX_FMA, COMPLEX_FMA_CONJ, COMPLEX_FMS,
COMPLEX_FMS_CONJ): New.
* optabs.def (cmla_optab, cmla_conj_optab, cmls_optab, cmls_conj_optab):
New.
* tree-vect-slp-patterns.c (class complex_fma_pattern,
complex_fma_pattern::matches): New.
(slp_patterns): Add complex_fma_pattern.

> -Original Message-
> From: Gcc-patches  On Behalf Of Tamar
> Christina
> Sent: Friday, September 25, 2020 3:30 PM
> To: gcc-patches@gcc.gnu.org
> Cc: nd ; rguent...@suse.de; o...@ucw.cz
> Subject: [PATCH v2 8/16]middle-end: add Complex Multiply and
> Accumulate/Subtract and Multiply and Accumulate/Subtract with Conjucate
> detection
> 
> Hi All,
> 
> This patch adds pattern detections for the following operation:
> 
>   Complex FMLA, Conjucate FMLA of the second parameter and FMLS.
> 
> c += a * b, c += a * conj (b), c -= a * b and c -= a * conj (b)
> 
>   For the conjucate cases it supports under fast-math that the operands that
> is
>   being conjucated be flipped by flipping the arguments to the optab.  This
>   allows it to support c = conj (a) * b and c += conj (a) * b.
> 
>   where a, b and c are complex numbers.
> 
> Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
> 
> Ok for master?
> 
> Thanks,
> Tamar
> 
> gcc/ChangeLog:
> 
>   * doc/md.texi: Document optabs.
>   * internal-fn.def (COMPLEX_FMA, COMPLEX_FMA_CONJ,
> COMPLEX_FMS,
>   COMPLEX_FMS_CONJ): New.
>   * optabs.def (cmla_optab, cmla_conj_optab, cmls_optab,
> cmls_conj_optab):
>   New.
>   * tree-vect-slp-patterns.c (class ComplexFMAPattern): New.
>   (slp_patterns): Add ComplexFMAPattern.
> 
> --
diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi
index ddaf1abaccbd44dae11ea902ec38b474aacfb8e1..d8142f745050d963e8d15c7793fae06d9ad02020 100644
--- a/gcc/doc/md.texi
+++ b/gcc/doc/md.texi
@@ -6143,6 +6143,50 @@ rotations @var{m} of 90 or 270.
 
 This pattern is not allowed to @code{FAIL}.
 
+@cindex @code{cmla@var{m}4} instruction pattern
+@item @samp{cmla@var{m}4}
+Perform a vector floating point multiply and accumulate of complex numbers
+in operand 0, operand 1 and operand 2.
+
+The instruction must perform the operation on data loaded contiguously into the
+vectors.
+The operation is only supported for vector modes @var{m}.
+
+This pattern is not allowed to @code{FAIL}.
+
+@cindex @code{cmla_conj@var{m}4} instruction pattern
+@item @samp{cmla_conj@var{m}4}
+Perform a vector floating point multiply and accumulate of complex numbers
+in operand 0, operand 1 and the conjucate of operand 2.
+
+The instruction must perform the operation on data loaded contiguously into the
+vectors.
+The operation is only supported for vector modes @var{m}.
+
+This pattern is not allowed to @code{FAIL}.
+
+@cindex @code{cmls@var{m}4} instruction pattern
+@item @samp{cmls@var{m}4}
+Perform a vector floating point multiply and subtract of complex numbers
+in operand 0, operand 1 and operand 2.
+
+The instruction must perform the operation on data loaded contiguously into the
+vectors.
+The operation is only supported for vector modes @var{m}.
+
+This pattern is not allowed to @code{FAIL}.
+
+@cindex @code{cmls_conj@var{m}4} instruction pattern
+@item @samp{cmls_conj@var{m}4}
+Perform a vector floating point multiply and subtract of complex numbers
+in operand 0, operand 1 and the conjucate of operand 2.
+
+The instruction must perform the operation on data loaded contiguously into the
+vectors.
+The operation is only supported for vector modes @var{m}.
+
+This pattern is not allowed to @code{FAIL}.
+
 @cindex @code{cmul@var{m}4} instruction pattern
 @item @samp{cmul@var{m}4}
 Perform a vector floating point multiplication of complex numbers in operand 0
diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
index cb41643f5e332518a0271bb8e1af4883c8bd6880..acb7d9f3bdc757437d5492a652144ba31c2ef702 100644
--- a/gcc/internal-fn.def
+++ b/gcc/internal-fn.def
@@ -288,6 +288,10 @@ DEF_INTERNAL_FLT_FN (LDEXP, ECF_CONST, ldexp, binary)
 
 /* Ternary math functions.  */
 DEF_INTERNAL_FLT_FLOATN_FN (FMA, ECF_CONST, fma, ternary)
+DEF_INTERNAL_OPTAB_FN (COMPLEX_FMA, ECF_CONST, cmla, ternary)
+DEF_INTERNAL_OPTAB_FN (COMPLEX_FMA_CONJ, ECF_CONST, cmla_conj, ternary)
+DEF_INTERNAL_OPTAB_FN (COMPLEX_FMS, ECF_CONST, cmls, ternary)
+DEF_INTERNAL_OPTAB_FN (COMPLEX_FMS_CONJ, ECF_CONST, cmls_conj, ternary)
 
 /* Unary integer ops.  */
 DEF_INTERNAL_INT_FN (CLRSB, ECF_CONST | ECF_NOTHROW, clrsb, unary)
diff --git a/gcc/optabs.def b/gcc/optabs.def
index 9c267d422478d0011f288b1f5f62daabe3989ba7..19db9c00896cd08adfd20a01669990bbbebd79f1 100644
--- a/gcc/optabs.def
+++ b/gcc/optabs.def
@@ -294,6 +294,10 @@ OPTAB_D (cadd90_optab, "cadd90$a3")
 OPTAB_D (cadd270_optab, "cadd270$a3")

[PATCH v2 9/18]middle-end optimize slp simplify back to back permutes.

2020-11-03 Thread Tamar Christina via Gcc-patches
Hi All,

This optimizes sequential permutes, i.e. if there are two permutes back to back
this function applies the permute of the parent to the child and removes the
parent.

If the resulting permute in the child is now a no-op, then the child is also
dropped from the graph and the parent's parent is attached to the child's child.

This relies on the materialization point calculation in optimize SLP.

Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
Tests are included as part of the final patch as they need the SLP pattern
matcher to insert permutes in between.

This allows us to remove useless permutes such as

ldr q0, [x0, x3]
ldr q2, [x1, x3]
trn1v1.4s, v0.4s, v0.4s
trn2v0.4s, v0.4s, v0.4s
trn1v0.4s, v1.4s, v0.4s
mov v1.16b, v3.16b
fcmla   v1.4s, v0.4s, v2.4s, #0
fcmla   v1.4s, v0.4s, v2.4s, #90
str q1, [x2, x3]

from the sequence the vectorizer puts out and give

ldr q0, [x0, x3]
ldr q2, [x1, x3]
mov v1.16b, v3.16b
fcmla   v1.4s, v0.4s, v2.4s, #0
fcmla   v1.4s, v0.4s, v2.4s, #90
str q1, [x2, x3]

instead

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

* tree-vect-slp.c (vect_slp_tree_permute_noop_p): New.
(vect_optimize_slp): Optimize permutes.
(vectorizable_slp_permutation): Fix typo.

-- 
diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c
index c03fc2fbecad1a2219504ac9daae75495e691775..48f615e1952707de4827f0e69e443c0a7db27d81 100644
--- a/gcc/tree-vect-slp.c
+++ b/gcc/tree-vect-slp.c
@@ -290,6 +290,27 @@ vect_slp_tree_uniform_p (slp_tree node)
   return true;
 }
 
+/* Return true when the node is a permute node and the permutation the node
+   contains is a no-op.  */
+
+static bool
+vect_slp_tree_permute_noop_p (slp_tree node)
+{
+  gcc_assert (SLP_TREE_CODE (node) == VEC_PERM_EXPR);
+
+  if (!SLP_TREE_LANE_PERMUTATION (node).exists ())
+return true;
+
+  unsigned x, seed;
+  lane_permutation_t perms = SLP_TREE_LANE_PERMUTATION (node);
+  seed = perms[0].second;
+  for (x = 1; x < perms.length (); x++)
+if (perms[x].first != perms[0].first || perms[x].second != ++seed)
+  return false;
+
+  return true;
+}
+
 /* Find the place of the data-ref in STMT_INFO in the interleaving chain
that starts from FIRST_STMT_INFO.  Return -1 if the data-ref is not a part
of the chain.  */
@@ -3150,6 +3171,41 @@ vect_optimize_slp (vec_info *vinfo)
 	/* For loads simply drop the permutation, the load permutation
 	   already performs the desired permutation.  */
 	;
+	  else if (SLP_TREE_LANE_PERMUTATION (node).exists ())
+	{
+	  /* If the node if already a permute node we just need to apply
+		 the permutation to the permute node itself.  */
+	  if (dump_enabled_p ())
+		dump_printf_loc (MSG_NOTE, vect_location,
+ "simplifying permute node %p\n",
+ node);
+
+	  vect_slp_permute (perms[perm], SLP_TREE_LANE_PERMUTATION (node),
+true);
+
+	  /* If the remaining permute is a no-op then we can just drop the
+		 node instead of materializing it.  */
+	  if (vect_slp_tree_permute_noop_p (node))
+		{
+		  if (dump_enabled_p ())
+		dump_printf_loc (MSG_NOTE, vect_location,
+ "removing unneeded permute node %p\n",
+ node);
+
+		   unsigned idx = SLP_TREE_LANE_PERMUTATION (node)[0].first;
+		   slp_tree value = SLP_TREE_CHILDREN (node)[idx];
+		   unsigned src = slpg->vertices[node->vertex].pred->src;
+		   slp_tree prev = vertices[src];
+		   unsigned dest;
+		   slp_tree tmp;
+		   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (prev), dest, tmp)
+		 if (tmp == node)
+		   {
+			  SLP_TREE_CHILDREN (prev)[dest] = value;
+			  break;
+		}
+		}
+	}
 	  else
 	{
 	  if (dump_enabled_p ())
@@ -5361,7 +5417,7 @@ vectorizable_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
 	  if (dump_enabled_p ())
 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 			 "permutation requires at "
-			 "least three vectors");
+			 "least three vectors\n");
 	  gcc_assert (!gsi);
 	  return false;
 	}



[PATCH v2 10/18]middle-end simplify lane permutes which selects from loads from the same DR.

2020-11-03 Thread Tamar Christina via Gcc-patches
Hi All,

This change allows one to simplify lane permutes that select from multiple load
leafs that load from the same DR group by promoting the VEC_PERM node into a
load itself and pushing the lane permute into it as a load permute.

This saves us from having to calculate where to materialize a new load node.
If the resulting loads are now unused they are freed and are removed from the
graph.

This allows us to handle cases where we would have generated:

moviv4.4s, 0
adrpx3, .LC0
ldr q5, [x3, #:lo12:.LC0]
mov x3, 0
.p2align 3,,7
.L2:
mov v0.16b, v4.16b
mov v3.16b, v4.16b
ldr q1, [x1, x3]
ldr q2, [x0, x3]
fcmla   v0.4s, v2.4s, v1.4s, #0
fcmla   v3.4s, v1.4s, v2.4s, #0
fcmla   v0.4s, v2.4s, v1.4s, #270
fcmla   v3.4s, v1.4s, v2.4s, #270
mov v1.16b, v3.16b
tbl v0.16b, {v0.16b - v1.16b}, v5.16b
str q0, [x2, x3]
add x3, x3, 16
cmp x3, 1600
bne .L2
ret

and instead generate

mov x3, 0
.p2align 3,,7
.L27:
ldr q0, [x2, x3]
ldr q1, [x0, x3]
ldr q2, [x1, x3]
fcmla   v0.2d, v1.2d, v2.2d, #0
fcmla   v0.2d, v1.2d, v2.2d, #270
str q0, [x2, x3]
add x3, x3, 16
cmp x3, 512
bne .L27
ret

This runs as a pre-step such that permute simplification can still inspect
whether this permute is needed.

Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
Tests are included as part of the final patch as they need the SLP pattern
matcher to insert permutes in between.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

* tree-vect-slp.c (vect_optimize_slp): Promote permutes.

-- 
diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c
index 48f615e1952707de4827f0e69e443c0a7db27d81..a3881b59b3c2aafb216ef320633255ed91f4dd45 100644
--- a/gcc/tree-vect-slp.c
+++ b/gcc/tree-vect-slp.c
@@ -2937,6 +2937,7 @@ vect_optimize_slp (vec_info *vinfo)
   unsigned i;
   auto_vec vertices;
   auto_vec leafs;
+  hash_set > dup_leafs;
   vect_slp_build_vertices (vinfo, vertices, leafs);
 
   struct graph *slpg = new_graph (vertices.length ());
@@ -2954,12 +2955,14 @@ vect_optimize_slp (vec_info *vinfo)
   graphds_dfs (slpg, &leafs[0], leafs.length (), &ipo, false, NULL, NULL);
 
   auto_sbitmap n_visited (vertices.length ());
+  auto_sbitmap n_replicated (vertices.length ());
   auto_sbitmap n_materialize (vertices.length ());
   auto_vec n_perm (vertices.length ());
   auto_vec > perms;
 
   bitmap_clear (n_visited);
   bitmap_clear (n_materialize);
+  bitmap_clear (n_replicated);
   n_perm.quick_grow_cleared (vertices.length ());
   perms.safe_push (vNULL); /* zero is no permute */
 
@@ -3000,6 +3003,11 @@ vect_optimize_slp (vec_info *vinfo)
   /* If there's no permute no need to split one out.  */
   if (!any_permute)
 	continue;
+
+  /* If the operation is a replication mark it for further inspection.  */
+  if (imin == imax)
+	dup_leafs.add (idx);
+
   /* If the span doesn't match we'd disrupt VF computation, avoid
 	 that for now.  */
   if (imax - imin + 1 != SLP_TREE_LANES (node))
@@ -3028,6 +3036,100 @@ vect_optimize_slp (vec_info *vinfo)
   n_perm[idx] = perms.length () - 1;
 }
 
+  /* Inspect all replication node and determine if they are connected to a
+ permute operation that may linearize the load.  */
+  for (hash_set >::iterator iter = dup_leafs.begin ();
+   iter != dup_leafs.end (); ++iter)
+{
+  int idx = *iter;
+
+  graph_edge *edge = slpg->vertices[idx].pred;
+  do
+	{
+	  slp_tree node = vertices[idx];
+	  unsigned src = edge->src;
+	  /* If we've visited the permute node already leave it alone.  This
+	 prevents us from re-inspecting it for every leafs that lead to it.  */
+	  if (bitmap_bit_p (n_replicated, src))
+	continue;
+
+	  slp_tree parent = vertices[src];
+	  bitmap_set_bit (n_replicated, src);
+
+	  if (!SLP_TREE_LANE_PERMUTATION (parent).exists ())
+	continue;
+
+	  /* Check if all edges lead to a load and that all the loads are
+	 coming from the same group.  */
+	  unsigned j;
+	  bool distribute_p = SLP_TREE_CHILDREN (parent).length () > 0;
+	  stmt_vec_info rep_stmt = SLP_TREE_REPRESENTATIVE (node);
+	  stmt_vec_info dr_stmt = DR_GROUP_FIRST_ELEMENT (rep_stmt);
+	  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (parent), j, node)
+	{
+	  /* Check if this is one of the nodes we know have a replication
+		 operation.  */
+	  distribute_p = dup_leafs.contains (node->vertex);
+	  if (!distribute_p)
+		break;
+	  stmt_vec_info cur_dr_stmt = SLP_TREE_REPRESENTATIVE (node);
+	  cur_dr_stmt = DR_GROUP_FIRST_ELEMENT (cur_dr_stmt);
+	  distribute_p = dr_stmt == cur_dr_stmt;
+	  if (!distribute_p)
+		break;
+	}
+
+	  /* If we have a valid node to optimize, do it and disconnect it from
+	 

[PATCH] libgcc: Add a weak stub for __sync_synchronize

2020-11-03 Thread Bernd Edlinger
Hi,

this fixes a problem with a missing symbol __sync_synchronize
which happens when newlib is used together with libstdc++ for
the non-threaded simulator target arm-none-eabi.

There are several questions on stackoverflow about this issue.

I would like to add a weak symbol for this target, since this
is only a default implementation and not meant to override a
possibly more sophisticated synchronization function from the
c-runtime.


Regression tested successfully on arm-none-eabi with newlib-3.3.0.

Is it OK for trunk?


Thanks
Bernd.
From f8a3df552f4b98308096659c66b4c8ea68580f25 Mon Sep 17 00:00:00 2001
From: Bernd Edlinger 
Date: Mon, 2 Nov 2020 11:43:44 +0100
Subject: [PATCH] libgcc: Add a weak stub for __sync_synchronize

This patch adds a default implementation for __sync_synchronize,
which prevents many unresolved symbol errors on arm-none-eabi.
This happens often in C++ programs even without any threading.

libgcc:
2020-11-02  Bernd Edlinger  

	* config.host: Use t-eabi for arm-none-eabi.
	* config/arm/t-eabi: New.
	* config/arm/eabi-sync.c: New.
---
 libgcc/config.host|  2 +-
 libgcc/config/arm/eabi-sync.c | 39 +++
 libgcc/config/arm/t-eabi  |  1 +
 3 files changed, 41 insertions(+), 1 deletion(-)
 create mode 100644 libgcc/config/arm/eabi-sync.c
 create mode 100644 libgcc/config/arm/t-eabi

diff --git a/libgcc/config.host b/libgcc/config.host
index fd8e55e..e25abf4 100644
--- a/libgcc/config.host
+++ b/libgcc/config.host
@@ -495,7 +495,7 @@ arm*-*-eabi* | arm*-*-symbianelf* | arm*-*-rtems*)
 	tm_file="$tm_file arm/bpabi-lib.h"
 	case ${host} in
 	arm*-*-eabi* | arm*-*-rtems*)
-	  tmake_file="${tmake_file} arm/t-bpabi t-crtfm"
+	  tmake_file="${tmake_file} arm/t-bpabi t-crtfm arm/t-eabi"
 	  extra_parts="crtbegin.o crtend.o crti.o crtn.o"
 	  ;;
 	arm*-*-symbianelf*)
diff --git a/libgcc/config/arm/eabi-sync.c b/libgcc/config/arm/eabi-sync.c
new file mode 100644
index 000..bffdd4a
--- /dev/null
+++ b/libgcc/config/arm/eabi-sync.c
@@ -0,0 +1,39 @@
+/* Copyright (C) 2020 Free Software Foundation, Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 3, or (at your option) any later
+version.
+
+GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+.  */
+
+void __attribute__ ((weak))
+__sync_synchronize (void)
+{
+#if defined (__ARM_ARCH_6__) || defined (__ARM_ARCH_6J__)   \
+|| defined (__ARM_ARCH_6K__) || defined (__ARM_ARCH_6T2__)  \
+|| defined (__ARM_ARCH_6Z__) || defined (__ARM_ARCH_6ZK__)  \
+|| defined (__ARM_ARCH_7__) || defined (__ARM_ARCH_7A__)
+#if defined (__ARM_ARCH_7__) || defined (__ARM_ARCH_7A__)
+__asm __volatile ("dmb" : : : "memory");
+#else
+__asm __volatile ("mcr p15, 0, r0, c7, c10, 5" : : : "memory");
+#endif
+#else
+__asm __volatile ("nop" : : : "memory");
+#endif
+}
diff --git a/libgcc/config/arm/t-eabi b/libgcc/config/arm/t-eabi
new file mode 100644
index 000..556032f
--- /dev/null
+++ b/libgcc/config/arm/t-eabi
@@ -0,0 +1 @@
+LIB2ADD_ST += $(srcdir)/config/arm/eabi-sync.c
-- 
1.9.1



[PATCH] Tweaks to ranger cache

2020-11-03 Thread Andrew MacLeod via Gcc-patches

This patch does some minor tweaking to the ranger caches.

1 - the ssa_block_range class is a vector over the BB's and contains 
pointers to any calculated on-entry ranges for an ssa-name.    The class 
wasn't doing any bounds checking... It's a pretty controlled environment, 
but still... safer to have a check just in case we stumble across a 
place where we are working in a new BB unexpectedly.


2 - The on-entry cache consists of a vector of ssa_block_ranges: 
whenever a name is accessed for the first time, its vector is 
allocated, full of NULL range pointers.  There are many times we are 
making a query as to whether there is a range or not...   And we don't 
need to do the allocation in order to make these checks.  This actually 
picked up a significant amount of time.   Enough to absorb some extra 
work I am going to add later :-)


3 - The Ranger cache was simply exporting its 3 component caches 
publicly for the ranger to consume until I got to fixing it.  This 
patch also privatizes the global range cache and the on-entry block class.
  - The global cache is now accessed thru a new ranger_cache get and 
set routine,
  -  and the on-entry cache was being accessed purely for its dump 
listing, so that is now hidden behind a new ranger cache dump facility.


There are no real functional changes.. this is all fairly superficial, 
but important for follow on stuff.  We'll see if the asserts trigger 
anywhere.


Bootstrapped on x86_64-pc-linux-gnu, no regressions.  pushed.

Andrew
commit d0a90d8e40a9024ed9297b63a34ac9b0f080ed5b
Author: Andrew MacLeod 
Date:   Mon Nov 2 13:06:46 2020 -0500

Tweaks to ranger cache

Add some bounds checking to ssa_block_ranges, and privatize the
ranges block cache and global cache, adding API points for accessing them.

* gimple-range-cache.h (block_range_cache): Add new entry point.
(ranger_cache): Privatize global and block cache members.
* gimple-range-cache.cc (ssa_block_ranges::set_bb_range): Add bounds
check.
(ssa_block_ranges::set_bb_varying): Ditto.
(ssa_block_ranges::get_bb_range): Ditto.
(ssa_block_ranges::bb_range_p): Ditto.
(block_range_cache::get_block_ranges): Fix formatting.
(block_range_cache::query_block_ranges): New.
(block_range_cache::get_bb_range): Use Query_block_ranges.
(block_range_cache::bb_range_p): Ditto.
(ranger_cache::dump): New.
(ranger_cache::get_global_range): New.
(ranger_cache::set_global_range): New.
* gimple-range.cc (gimple_ranger::range_of_expr): Use new API.
(gimple_ranger::range_of_stmt): Ditto.
(gimple_ranger::export_global_ranges): Ditto.
(gimple_ranger::dump): Ditto.

diff --git a/gcc/gimple-range-cache.cc b/gcc/gimple-range-cache.cc
index bc9243c1279..574debbc166 100644
--- a/gcc/gimple-range-cache.cc
+++ b/gcc/gimple-range-cache.cc
@@ -165,6 +165,7 @@ ssa_block_ranges::~ssa_block_ranges ()
 void
 ssa_block_ranges::set_bb_range (const basic_block bb, const irange &r)
 {
+  gcc_checking_assert ((unsigned) bb->index < m_tab.length ());
   irange *m = m_irange_allocator->allocate (r);
   m_tab[bb->index] = m;
 }
@@ -174,6 +175,7 @@ ssa_block_ranges::set_bb_range (const basic_block bb, const 
irange &r)
 void
 ssa_block_ranges::set_bb_varying (const basic_block bb)
 {
+  gcc_checking_assert ((unsigned) bb->index < m_tab.length ());
   m_tab[bb->index] = m_type_range;
 }
 
@@ -183,6 +185,7 @@ ssa_block_ranges::set_bb_varying (const basic_block bb)
 bool
 ssa_block_ranges::get_bb_range (irange &r, const basic_block bb)
 {
+  gcc_checking_assert ((unsigned) bb->index < m_tab.length ());
   irange *m = m_tab[bb->index];
   if (m)
 {
@@ -197,6 +200,7 @@ ssa_block_ranges::get_bb_range (irange &r, const 
basic_block bb)
 bool
 ssa_block_ranges::bb_range_p (const basic_block bb)
 {
+  gcc_checking_assert ((unsigned) bb->index < m_tab.length ());
   return m_tab[bb->index] != NULL;
 }
 
@@ -244,8 +248,8 @@ block_range_cache::~block_range_cache ()
   m_ssa_ranges.release ();
 }
 
-// Return a reference to the m_block_cache for NAME.  If it has not been
-// accessed yet, allocate it.
+// Return a reference to the ssa_block_cache for NAME.  If it has not been
+// accessed yet, allocate it first.
 
 ssa_block_ranges &
 block_range_cache::get_block_ranges (tree name)
@@ -255,11 +259,24 @@ block_range_cache::get_block_ranges (tree name)
 m_ssa_ranges.safe_grow_cleared (num_ssa_names + 1);
 
   if (!m_ssa_ranges[v])
-m_ssa_ranges[v] = new ssa_block_ranges (TREE_TYPE (name), 
m_irange_allocator);
-
+m_ssa_ranges[v] = new ssa_block_ranges (TREE_TYPE (name),
+   m_irange_allocator);
   return *(m_ssa_ranges[v]);
 }
 
+
+// Return a pointer to the ssa_block_cache for NAME.  If it has not been
+// accessed yet, return NULL.
+
+ssa_block_ranges *
+block_range_cach

[PATCH] More Ranger cache tweaks

2020-11-03 Thread Andrew MacLeod via Gcc-patches
The range on entry cache is managed by the ranger_cache class. Part of 
the service it provides is auto-filling the cache when a request is made 
for the range on entry to a block.


The Ranger makes sure the global range at the def statement has been 
calculated then submits a requests for the range on entry to the desired 
block. fill_block_cache() then propagates the range from the def point, 
thru successive blocks to the block being request, adjusting that range 
based on any outgoing ranges GORI reports in the intervening blocks.


During this propagation, no new ranges are calculated, only outgoing 
edge adjustments. During this process, if GORI discovers it is using a 
value during the outgoing_edge calculation that hasn't been evaluated 
yet, it marks it as a "poor value", and uses a best guess (VARYING worst 
case) and continues.


Once the propagation is finished, any "poor values" which were 
discovered are properly evaluated by a ranger request for that value, 
and then the original range that was being propagated is re-evaluated on 
that edge to see if it changed the calculated result.  If it did, this 
new value is re-propagated.


This is what I use to refer to as "iterative updating".  There is an 
iterative aspect to it, but it doesn't really continue iterating; it's 
really just propagation.


This patch
 1)  renames the iterative bits to "propagation".
 2) The propagation request is also split out from fill_block_cache 
into a routine  called propagate_updated_value(). This will only 
propagate the range to blocks which already have on-entry values, not 
places which haven't been looked at yet.
 3)  And finally the set_global_range routine now checks if there was 
already a global range set and calls propagate_updated_value() if there 
was.  This ensures that if there were any on-entry values set and there 
is a new global value, that it is properly propagated through whatever 
cache entries there are.


The primary times that global values are currently re-evaluated occur 
during back-edge cycles when a "poor value" is calculated for 
improvements to keep going without having final resolution.


Bootstrapped on x86_64-pc-linux-gnu, no regressions.  pushed.

Andrew






commit 435b300906629e6ff46bf69ce5fd8b069f4cb731
Author: Andrew MacLeod 
Date:   Mon Nov 2 17:04:23 2020 -0500

More Ranger cache tweaks

This patch splits the individual value propagation out from 
fill_block_cache,
and calls it from set_global_value when the global value is updated.
This ensures the "current" global value is reflected in the on-entry cache.

* gimple-range-cache.cc (ssa_global_cache::get_global_range): Return
true if there was a previous range set.
(ranger_cache::ranger_cache): Take a gimple_ranger parameter.
(ranger_cache::set_global_range): Propagate the value if updating.
(ranger_cache::propagate_cache): Renamed from 
iterative_cache_update.
(ranger_cache::propagate_updated_value): New.  Split from:
(ranger_cache::fill_block_cache): Split out value propagator.
* gimple-range-cache.h (ssa_global_cache): Update prototypes.
(ranger_cache): Update prototypes.

diff --git a/gcc/gimple-range-cache.cc b/gcc/gimple-range-cache.cc
index 574debbc166..cca9025abba 100644
--- a/gcc/gimple-range-cache.cc
+++ b/gcc/gimple-range-cache.cc
@@ -419,8 +419,9 @@ ssa_global_cache::get_global_range (irange &r, tree name) 
const
 }
 
 // Set the range for NAME to R in the global cache.
+// Return TRUE if there was already a range set, otherwise false.
 
-void
+bool
 ssa_global_cache::set_global_range (tree name, const irange &r)
 {
   unsigned v = SSA_NAME_VERSION (name);
@@ -432,6 +433,7 @@ ssa_global_cache::set_global_range (tree name, const irange 
&r)
 *m = r;
   else
 m_tab[v] = m_irange_allocator->allocate (r);
+  return m != NULL;
 }
 
 // Set the range for NAME to R in the glonbal cache.
@@ -476,7 +478,7 @@ ssa_global_cache::dump (FILE *f)
 
 // --
 
-ranger_cache::ranger_cache (range_query &q) : query (q)
+ranger_cache::ranger_cache (gimple_ranger &q) : query (q)
 {
   m_workback.create (0);
   m_workback.safe_grow_cleared (last_basic_block_for_fn (cfun));
@@ -532,7 +534,18 @@ ranger_cache::get_global_range (irange &r, tree name) const
 void
 ranger_cache::set_global_range (tree name, const irange &r)
 {
-  m_globals.set_global_range (name, r);
+  if (m_globals.set_global_range (name, r))
+{
+  // If there was already a range set, propagate the new value.
+  basic_block bb = gimple_bb (SSA_NAME_DEF_STMT (name));
+  if (!bb)
+   bb = ENTRY_BLOCK_PTR_FOR_FN (cfun);
+
+  if (DEBUG_RANGE_CACHE)
+   fprintf (dump_file, "   GLOBAL :");
+
+  propagate_updated_value (name, bb);
+}
 }
 
 // Push a request for a new lookup in block BB of name.  Return true if
@@ -660,11 +673,11 @@ ra

Move pass_oacc_device_lower after pass_graphite

2020-11-03 Thread Frederik Harwath

Hi,

as a first step towards enabling the use of Graphite for optimizing
OpenACC loops this patch moves the OpenACC device lowering after the
Graphite pass.  This means that the device lowering now takes place
after some crucial optimization passes. Thus new instances of those
passes are added inside of a new pass pass_oacc_functions which ensures
that they run on OpenACC functions only. The choice of the new position
for pass_oacc_device_lower is further constrained by the need to
execute it before pass_vectorize.  This means that
pass_oacc_device_lower now runs inside of pass_tree_loop. A further
instance of the pass that handles functions without loops is added
inside of pass_tree_no_loop. Yet another pass instance that executes if
optimizations are disabled is included inside of a new
pass_no_optimizations.

The patch has been bootstrapped on x86_64-linux-gnu and tested with the
GCC testsuite and with the libgomp testsuite with nvptx and gcn
offloading.

The patch should have no impact on non-OpenACC user code. However the
new pass instances have changed the pass instance numbering and hence
the dump scanning commands in several tests had to be adjusted. I hope
that I found all that needed adjustment, but it is well possible that I
missed some tests that execute for particular targets or non-default
languages only. The resulting UNRESOLVED tests are usually easily fixed
by appending a pass number to the name of a pass that previously had no
number (e.g. "cunrolli" becomes "cunrolli1") or by incrementing the pass
number (e.g. "dce6" becomes "dce7") in a dump scanning command.

The patch leads to several new unresolved tests in the libgomp testsuite
which are caused by the combination of torture testing, missing cleanup
of the offload dump files, and the new pass numbering.  If a test that
uses, for instance, "-foffload=fdump-tree-oaccdevlow" gets compiled with
"-O0" and afterwards with "-O2", each run of the test executes different
instances of pass_oacc_device_lower and produces dumps whose names
differ only in the pass instance number.  The dump scanning command in
the second run fails, because the dump files do not get removed after
the first run and the command consequently matches two different dump
files.  This seems to be a known issue.  I am going to submit a patch
that implements the cleanup of the offload dumps soon.

I have tried to rule out performance regressions by running different
benchmark suites with nvptx and gcn offloading. Nevertheless, I think
that it makes sense to keep an eye on OpenACC performance in the close
future and revisit the optimizations that run on the device lowered
function if necessary.

Ok to include the patch in master?

Best regards,
Frederik


-
Mentor Graphics (Deutschland) GmbH, Arnulfstraße 201, 80634 München / Germany
Registergericht München HRB 106955, Geschäftsführer: Thomas Heurung, Alexander 
Walter
>From 93fb166876a0540416e19c9428316d1370dd1e1b Mon Sep 17 00:00:00 2001
From: Frederik Harwath 
Date: Tue, 3 Nov 2020 12:58:37 +0100
Subject: [PATCH] Move pass_oacc_device_lower after pass_graphite

As a first step towards enabling the use of Graphite for optimizing
OpenACC loops, the OpenACC device lowering must be moved after the
Graphite pass.  This means that the device lowering now takes place
after some crucial optimization passes. Thus new instances of those
passes are added inside of a new pass pass_oacc_functions which
ensures that they execute on OpenACC functions only. The choice of the
new position for pass_oacc_device_lower is further constrained by the
need to execute it before pass_vectorize.  This means that
pass_oacc_device_lower now runs inside of pass_tree_loop. A further
instance of the pass that handles functions without loops is added
inside of pass_tree_no_loop. Yet another pass instance that executes
if optimizations are disabled is included inside of a new
pass_no_optimizations.

2020-11-03  Frederik Harwath  
	Thomas Schwinge  

gcc/ChangeLog:

	* omp-general.c (oacc_get_fn_dim_size): Adapt.
	* omp-offload.c (pass_oacc_device_lower::clone) : New method.
	* passes.c (class pass_no_optimizations): New pass.
	(make_pass_no_optimizations): New static function.
	* passes.def: Move pass_oacc_device_lower into pass_tree_loop
	and add further instances to pass_tree_no_loop and to new pass
	pass_no_optimizations. Add new instances of
	pass_lower_complex, pass_ccp, pass_sink_code,
	pass_complete_unrolli, pass_backprop, pass_phiprop,
	pass_forwprop, pass_vrp, pass_dce, pass_loop_done,
	pass_loop_init, pass_fix_loops supporting the
	pass_oacc_device_lower instance in pass_tree_loop.
	* tree-pass.h (make_pass_oacc_functions): New static function.
	(make_pass_oacc_functions): New static function.
	* tree-ssa-loop-ivcanon.c (pass_complete_unroll::clone): New method.
	(pass_complete_unrolli::clone): New method.
	* tree-ssa-loop.c (pass_fix_loops::clone): New method.
	(pass_tree_loop_init::clone): New method.
	(pass_tree_loop_d

Re: [PATCH v4] builtins: (not just) rs6000: Add builtins for fegetround, feclearexcept and feraiseexcept [PR94193]

2020-11-03 Thread Raoni Fassina Firmino via Gcc-patches
On Wed, Oct 28, 2020 at 12:03:33PM -0500, Segher Boessenkool wrote:
> For raising invalid we could perhaps set VXSOFT, but is that okay for
> other libcs than just glibc?  So that can be a future improvement if it
> turns out to be useful, as we discussed elsewhere.

Yeap.

> With those tweaks, the rs6000 part of the patch is okay for trunk.
> Thanks!  (The generic part looks fine to me as well, but I am not
> maintainer of that.)

Thanks, I will send and updated version later today.


o/
Raoni


Re: [PATCH 1/5] [PR target/96342] Change field "simdlen" into poly_uint64

2020-11-03 Thread Richard Sandiford via Gcc-patches
"yangyang (ET)"  writes:
> Hi,
>
> I have revised the patch based on your suggestions. I use multiple_p instead 
> of !multiple_p if the eq situation is OK to make it easier to understand.
>
>> >> > if (n->simdclone->inbranch)
>> >> >   this_badness += 2048;
>> >> > int target_badness = targetm.simd_clone.usable (n); @@ -3988,19
>> >> > +3988,19 @@ vectorizable_simd_clone_call (vec_info *vinfo,
>> >> > +stmt_vec_info
>> >> stmt_info,
>> >> > arginfo[i].vectype = get_vectype_for_scalar_type (vinfo, arg_type,
>> >> >
>> slp_node);
>> >> > if (arginfo[i].vectype == NULL
>> >> > -   || (simd_clone_subparts (arginfo[i].vectype)
>> >> > -   > bestn->simdclone->simdlen))
>> >> > +   || (known_gt (simd_clone_subparts (arginfo[i].vectype),
>> >> > + bestn->simdclone->simdlen)))
>> >>
>> >> Here too I think we want constant_multiple_p:
>> >>
>> >>   || !constant_multiple_p (bestn->simdclone->simdlen,
>> >>simd_clone_subparts
>> >> (arginfo[i].vectype))
>> >>
>> >
>> > Use multiple_p here since the multiple is not needed.
>>
>> True, but in the case of vectorisation, we need to generate a constant number
>> of copies at compile time.  If we don't enforce a constant multiple, we might
>> end up trying to use an Advanced SIMD routine when vectorising for SVE.
>>
>> The reason we don't have a two-argument version of constant_multiple_p is
>> that so far nothing has needed it (at least AFAIK).  There's no conceptual
>> problem with adding it though.  I'm happy to do that if it would help.
>>
>
> Two-argument versions of constant_multiple_p are added in the v3-patch. Could 
> you please check if the added versions are OK ?
>
> Bootstrap and tested on both aarch64 and x86 Linux platform, no new 
> regression witnessed.

Looks great, thanks.  Pushed to trunk.

Richard


Re: [Patch + RFC][contrib] gcc-changelog/git_commit.py: Check for missing description

2020-11-03 Thread Martin Liška

On 10/30/20 2:16 PM, Tobias Burnus wrote:

In terms of issues, it seems as if Ubuntu 20.04.1 LTS has a too
old unidiff – I copied the check from test_email.py and applied
it to git_email.py – otherwise, nearly all tests fail.


Hello.

Please apply this hunk separately, it's fine.



Still, I do see some fails – I have attached the fails I got.
(fails.log, second attachment)

Independent of that, I have now written a check for an empty
description.


Thanks for it.

I really think the check should support situations where a description
is provided on the next line (first after '\t', so not '\t*') as
you see in the failing test:

libiberty/ChangeLog:

PR lto/97290

* simple-object-elf.c (simple_object_elf_copy_lto_debug_sections):
Use sh_link of a .symtab_shndx section.

Martin



OK for the patch and thoughts about the fails?

Tobias

-
Mentor Graphics (Deutschland) GmbH, Arnulfstraße 201, 80634 München / Germany
Registergericht München HRB 106955, Geschäftsführer: Thomas Heurung, Alexander 
Walter




Re: [Patch + RFC][contrib] gcc-changelog/git_commit.py: Check for missing description

2020-11-03 Thread Martin Liška

On 10/30/20 2:16 PM, Tobias Burnus wrote:

In terms of issues, it seems as if Ubuntu 20.04.1 LTS has a too
old unidiff – I copied the check from test_email.py and applied
it to git_email.py – otherwise, nearly all tests fail.

Still, I do see some fails – I have attached the fails I got.
(fails.log, second attachment)

Independent of that, I have now written a check for an empty
description.

OK for the patch and thoughts about the fails?

Tobias

-
Mentor Graphics (Deutschland) GmbH, Arnulfstraße 201, 80634 München / Germany
Registergericht München HRB 106955, Geschäftsführer: Thomas Heurung, Alexander 
Walter


... and please move

+unidiff_supports_renaming = hasattr(PatchedFile(), 'is_rename')

to the toplevel of the file.

Thanks,
Martin


[PATCH][pushed] Add setup.cfg for pytest.

2020-11-03 Thread Martin Liška

It adds FLAKE8 automatically when pytest is run.

Installed.

contrib/ChangeLog:

* gcc-changelog/setup.cfg: New file.
---
 contrib/gcc-changelog/setup.cfg | 2 ++
 1 file changed, 2 insertions(+)
 create mode 100644 contrib/gcc-changelog/setup.cfg

diff --git a/contrib/gcc-changelog/setup.cfg b/contrib/gcc-changelog/setup.cfg
new file mode 100644
index 000..9e4a0f6479c
--- /dev/null
+++ b/contrib/gcc-changelog/setup.cfg
@@ -0,0 +1,2 @@
+[tool:pytest]
+addopts = -vv --flake8
--
2.29.1



cpplib: Fix off-by-one error

2020-11-03 Thread Nathan Sidwell


I noticed a fencepost error in the preprocessor.  We should be
checking if the next char is at the limit, not the current char (which
can't be, because we're looking at it).

libcpp/
* lex.c (_cpp_clean_line): Fix DOS off-by-one error.

pushing to trunk
--
Nathan Sidwell
diff --git i/libcpp/lex.c w/libcpp/lex.c
index fb222924c8c..1d522030a3c 100644
--- i/libcpp/lex.c
+++ w/libcpp/lex.c
@@ -1062,7 +1062,7 @@ _cpp_clean_line (cpp_reader *pfile)
   d = (uchar *) s;
 
   /* Handle DOS line endings.  */
-  if (*s == '\r' && s != buffer->rlimit && s[1] == '\n')
+  if (*s == '\r' && s + 1 != buffer->rlimit && s[1] == '\n')
 	s++;
 }
 


c++: Make extern-C mismatch an error

2020-11-03 Thread Nathan Sidwell


duplicate_decls was being lenient about extern-c mismatches, allowing
you to have two declarations in the symbol table after emitting an
error.  This resulted in duplicate error messages in modules, when we
find the same problem multiple times.  Let's just not let that happen.

gcc/cp/
* decl.c (duplicate_decls): Return error_mark_node fo extern-c
mismatch.

pushing to trunk
--
Nathan Sidwell
diff --git i/gcc/cp/decl.c w/gcc/cp/decl.c
index 114e8d0cb01..9428fa05258 100644
--- i/gcc/cp/decl.c
+++ w/gcc/cp/decl.c
@@ -1776,7 +1776,7 @@ duplicate_decls (tree newdecl, tree olddecl, bool hiding, bool was_hidden)
 			newdecl);
 	  inform (olddecl_loc,
 		  "previous declaration %q#D", olddecl);
-	  return NULL_TREE;
+	  return error_mark_node;
 	}
 	  /* For function versions, params and types match, but they
 	 are not ambiguous.  */


c++: A couple of template instantiation cleanups

2020-11-03 Thread Nathan Sidwell


I noticed that we were handling lambda extra scope during template
instantiation in a different order to how we handle the non-template
case.  Reordered that for consistency.  Also some more RAII during
template instantiation.

gcc/cp/
* pt.c (tsubst_lambda_expr): Reorder extra-scope handling to match
the non-template case.
(instantiate_body): Move a couple of declarations to their
initializers.

pushing to trunk

--
Nathan Sidwell
diff --git i/gcc/cp/pt.c w/gcc/cp/pt.c
index aa162d2a4f9..2041b5635f0 100644
--- i/gcc/cp/pt.c
+++ w/gcc/cp/pt.c
@@ -19081,15 +19081,6 @@ tsubst_lambda_expr (tree t, tree args, tsubst_flags_t complain, tree in_decl)
   LAMBDA_EXPR_MUTABLE_P (r) = LAMBDA_EXPR_MUTABLE_P (t);
   LAMBDA_EXPR_INSTANTIATED (r) = true;
 
-  if (LAMBDA_EXPR_EXTRA_SCOPE (t) == NULL_TREE)
-/* A lambda in a default argument outside a class gets no
-   LAMBDA_EXPR_EXTRA_SCOPE, as specified by the ABI.  But
-   tsubst_default_argument calls start_lambda_scope, so we need to
-   specifically ignore it here, and use the global scope.  */
-record_null_lambda_scope (r);
-  else
-record_lambda_scope (r);
-
   gcc_assert (LAMBDA_EXPR_THIS_CAPTURE (t) == NULL_TREE
 	  && LAMBDA_EXPR_PENDING_PROXIES (t) == NULL);
 
@@ -19168,6 +19159,15 @@ tsubst_lambda_expr (tree t, tree args, tsubst_flags_t complain, tree in_decl)
   if (type == error_mark_node)
 return error_mark_node;
 
+  if (LAMBDA_EXPR_EXTRA_SCOPE (t) == NULL_TREE)
+/* A lambda in a default argument outside a class gets no
+   LAMBDA_EXPR_EXTRA_SCOPE, as specified by the ABI.  But
+   tsubst_default_argument calls start_lambda_scope, so we need to
+   specifically ignore it here, and use the global scope.  */
+record_null_lambda_scope (r);
+  else
+record_lambda_scope (r);
+
   /* Do this again now that LAMBDA_EXPR_EXTRA_SCOPE is set.  */
   determine_visibility (TYPE_NAME (type));
 
@@ -25571,9 +25571,6 @@ instantiate_body (tree pattern, tree args, tree d, bool nested_p)
 
   if (VAR_P (d))
 {
-  tree init;
-  bool const_init = false;
-
   /* Clear out DECL_RTL; whatever was there before may not be right
 	 since we've reset the type of the declaration.  */
   SET_DECL_RTL (d, NULL);
@@ -25583,7 +25580,8 @@ instantiate_body (tree pattern, tree args, tree d, bool nested_p)
 	 regenerate_decl_from_template so we don't need to
 	 push/pop_access_scope again here.  Pull it out so that
 	 cp_finish_decl can process it.  */
-  init = DECL_INITIAL (d);
+  bool const_init = false;
+  tree init = DECL_INITIAL (d);
   DECL_INITIAL (d) = NULL_TREE;
   DECL_INITIALIZED_P (d) = 0;
 


Re: [PATCH v3] rs6000: Use direct move for char/short vector CTOR [PR96933]

2020-11-03 Thread Segher Boessenkool
Hi!

On Tue, Nov 03, 2020 at 03:25:19PM +0800, Kewen.Lin wrote:
> > I'm trying to be stricter about the test cases.
> > 
> > +++ b/gcc/testsuite/gcc.target/powerpc/pr96933-1.c
> > @@ -0,0 +1,14 @@
> > +/* { dg-do compile { target { lp64 && has_arch_pwr9 } } } */
> > +/* { dg-require-effective-target powerpc_p9vector_ok } */
> > +/* { dg-options "-O2" } */
> > 
> > Why does this test has_arch_pwr9 instead of adding -mdejagnu-cpu=power9?
> 
> I thought using -mdejagnu-cpu=power9 would force the case run with
> power9 cpu all the time, while using has_arch_pwr9 seems to be more
> flexible, it can be compiled with power9 or later (like -mcpu=power10),
> we can check whether we generate unexpected code on power10 or later.
> Does it sound good?

It will not run at all if your compiler (or testsuite invocation) does
not use at least power9.  Since the default for powerpc64-linux is
power4, and that for powerpc64le-linux is power8, this will happen for
many people (not to mention that it is extra important to test the
default setup, of course).

It probably would be useful if there was some convenient way to say
"use at least -mcpu=power9 for this, but some later cpu is fine too" --
but there is no such thing yet.

Using something like that might cause more maintenance issues later, see
"pstb" below for example, but that is not really an argument against
fixing this.

> > +++ b/gcc/testsuite/gcc.target/powerpc/pr96933-3.c
> > @@ -0,0 +1,63 @@
> > +/* { dg-do run } */
> > +/* { dg-require-effective-target p8vector_hw } */
> > +/* { dg-options "-O2" } */
> > 
> > Doesn't this need -mdejagnu-cpu=power8?
> 
> Thanks for catching!  Yes, it needs.  I was thinking to use one
> case for both Power8 and Power9 runs, it passed the testings on
> both machines.  But your question made me realize that it's
> incorrect when we are doing testing on Power8 but pass some
> external option like -mcpu=power9, it can generate power9 insns
> which are illegal on the machine.

If the compiler defaults to (say) -mcpu=power7, it will generate code
for that the way the testcase is set up now (and it will not run on
machines before power8, but that is separate).

> +  if (TARGET_DIRECT_MOVE && (mode == V16QImode || mode == V8HImode))
> +{
> +  rtx op[16];
> +  /* Force the values into word_mode registers.  */
> +  for (i = 0; i < n_elts; i++)
> + {
> +   rtx tmp = force_reg (GET_MODE_INNER (mode), XVECEXP (vals, 0, i));
> +   if (TARGET_POWERPC64)
> + {
> +   op[i] = gen_reg_rtx (DImode);
> +   emit_insn (gen_zero_extendqidi2 (op[i], tmp));
> + }
> +   else
> + {
> +   op[i] = gen_reg_rtx (SImode);
> +   emit_insn (gen_zero_extendqisi2 (op[i], tmp));
> + }
> + }

TARGET_POWERPC64 should be TARGET_64BIT afaics?  (See below.)

You can use Pmode then, too.  The zero_extend thing can be handled by
changing
  (define_insn "zero_extendqi2"
to
  (define_insn "@zero_extendqi2"
(and no other changes needed), and then calling
  emit_insn (gen_zero_extendqi2 (Pmode, op[i], tmp));
(or so is the theory.  This might need some other changes, and also all
other gen_zero_extendqi* callers need to change, so that is a separate
patch if you want to try.  This isn't so bad right now.)

> +   for (i = 0; i < n_elts; i++)
> + {
> +   vr_qi[i] = gen_reg_rtx (V16QImode);
> +   if (TARGET_POWERPC64)
> + emit_insn (gen_p8_mtvsrd_v16qidi2 (vr_qi[i], op[i]));
> +   else
> + emit_insn (gen_p8_mtvsrwz_v16qisi2 (vr_qi[i], op[i]));
> + }

TARGET_64BIT here as well.

TARGET_POWERPC64 means the current machine has the 64-bit insns.  It
does not mean the code will run in 64-bit mode (e.g. -m32 -mpowerpc64 is
just fine, and can be useful), but it also does not mean the OS (libc,
kernel, etc.) will actually save the full 64-bit registers -- making it
only useful on Darwin currently.

(You *can* run all of the testsuite flawlessly on Linux with those
options, but that only works because those are small, short-running
programs.  More "real", bigger and more complex programs fail in strange
and exciting ways!)

It's a pity the pre-p9 code cannot reuse most of what we do for p9.

> +(define_insn "p8_mtvsrwz_v16qisi2"
> +  [(set (match_operand:V16QI 0 "register_operand" "=wa")
> +(unspec:V16QI [(match_operand:SI 1 "register_operand" "r")]
> +   UNSPEC_P8V_MTVSRWZ))]
> +  "!TARGET_POWERPC64 && TARGET_DIRECT_MOVE"
> +  "mtvsrwz %x0,%1"
> +  [(set_attr "type" "mftgpr")])
> +
> +(define_insn "p8_mtvsrd_v16qidi2"
> +  [(set (match_operand:V16QI 0 "register_operand" "=wa")
> +(unspec:V16QI [(match_operand:DI 1 "register_operand" "r")]
> +   UNSPEC_P8V_MTVSRD))]
> +  "TARGET_POWERPC64 && TARGET_DIRECT_MOVE"
> +  "mtvsrd %x0,%1"
> +  [(set_attr "type" "mftgpr")])

TARGET_POWERPC64 is fine for these, btw.  You just cannot decide to put
a DImode in a register based on only this -- but if that 

c++: Directly fixup deferred eh-specs

2020-11-03 Thread Nathan Sidwell


eh-specifiers in a class definition are complete-definition contexts,
and we sometimes need to deferr their parsing.  We create a deferred
eh specifier, which can end up persisting in the type system due to
variants being created before the deferred parse.  This causes
problems in modules handling.

This patch adds fixup_deferred_exception_variants, which directly
modifies the variants of such an eh spec once parsed.  As commented,
the general case is quite hard, so it doesn't deal with everything.
But I do catch the cases I encountered (from the std library).

gcc/cp/
* cp-tree.h (fixup_deferred_exception_variants): Declare.
* parser.c (cp_parser_class_specifier_1): Call it when
completing deferred parses rather than creating a variant.
(cp_parser_member_declaration): Move comment from ...
(cp_parser_noexcept_specification_opt): ... here.  Refactor the
deferred parse.
* tree.c (fixup_deferred_exception_variants): New.

pushing to trunk

--
Nathan Sidwell
diff --git i/gcc/cp/cp-tree.h w/gcc/cp/cp-tree.h
index 334424770ef..26852f6f2e3 100644
--- i/gcc/cp/cp-tree.h
+++ w/gcc/cp/cp-tree.h
@@ -7360,6 +7360,7 @@ extern const char *cxx_printable_name_translate	(tree, int);
 extern tree canonical_eh_spec			(tree);
 extern tree build_cp_fntype_variant		(tree, cp_ref_qualifier, tree, bool);
 extern tree build_exception_variant		(tree, tree);
+extern void fixup_deferred_exception_variants   (tree, tree);
 extern tree bind_template_template_parm		(tree, tree);
 extern tree array_type_nelts_total		(tree);
 extern tree array_type_nelts_top		(tree);
diff --git i/gcc/cp/parser.c w/gcc/cp/parser.c
index dd8c4b56bd0..274797f1879 100644
--- i/gcc/cp/parser.c
+++ w/gcc/cp/parser.c
@@ -24334,8 +24334,12 @@ cp_parser_class_specifier_1 (cp_parser* parser)
 	  /* Now we can parse the noexcept-specifier.  */
 	  spec = cp_parser_late_noexcept_specifier (parser, spec);
 
-	  if (spec != error_mark_node)
-	TREE_TYPE (decl) = build_exception_variant (TREE_TYPE (decl), spec);
+	  if (spec == error_mark_node)
+	spec = NULL_TREE;
+
+	  /* Update the fn's type directly -- it might have escaped
+	 beyond this decl :(  */
+	  fixup_deferred_exception_variants (TREE_TYPE (decl), spec);
 
 	  /* Restore the state of local_variables_forbidden_p.  */
 	  parser->local_variables_forbidden_p = local_variables_forbidden_p;
@@ -25371,6 +25375,9 @@ cp_parser_member_declaration (cp_parser* parser)
 	  int ctor_dtor_or_conv_p;
 	  bool static_p = (decl_specifiers.storage_class == sc_static);
 	  cp_parser_flags flags = CP_PARSER_FLAGS_TYPENAME_OPTIONAL;
+	  /* We can't delay parsing for friends,
+		 alias-declarations, and typedefs, even though the
+		 standard seems to require it.  */
 	  if (!friend_p
 		  && !decl_spec_seq_has_spec_p (&decl_specifiers, ds_typedef))
 		flags |= CP_PARSER_FLAGS_DELAY_NOEXCEPT;
@@ -26059,19 +26066,14 @@ cp_parser_noexcept_specification_opt (cp_parser* parser,
 	 a class.  So, if the noexcept-specifier has the optional expression,
 	 just save the tokens, and reparse this after we're done with the
 	 class.  */
-  const bool literal_p
-	= ((cp_lexer_nth_token_is (parser->lexer, 3, CPP_NUMBER)
-	|| cp_lexer_nth_token_is (parser->lexer, 3, CPP_KEYWORD))
-	   && cp_lexer_nth_token_is (parser->lexer, 4, CPP_CLOSE_PAREN));
 
-  if (cp_lexer_nth_token_is (parser->lexer, 2, CPP_OPEN_PAREN)
+  if ((flags & CP_PARSER_FLAGS_DELAY_NOEXCEPT)
+	  && cp_lexer_nth_token_is (parser->lexer, 2, CPP_OPEN_PAREN)
 	  /* No need to delay parsing for a number literal or true/false.  */
-	  && !literal_p
+	  && !((cp_lexer_nth_token_is (parser->lexer, 3, CPP_NUMBER)
+		|| cp_lexer_nth_token_is (parser->lexer, 3, CPP_KEYWORD))
+	   && cp_lexer_nth_token_is (parser->lexer, 4, CPP_CLOSE_PAREN))
 	  && at_class_scope_p ()
-	  /* We don't delay parsing for friend member functions,
-	 alias-declarations, and typedefs, even though the standard seems
-	 to require it.  */
-	  && (flags & CP_PARSER_FLAGS_DELAY_NOEXCEPT)
 	  && TYPE_BEING_DEFINED (current_class_type)
 	  && !LAMBDA_TYPE_P (current_class_type))
 	return cp_parser_save_noexcept (parser);
diff --git i/gcc/cp/tree.c w/gcc/cp/tree.c
index 3087c4ab52c..7e763479f7a 100644
--- i/gcc/cp/tree.c
+++ w/gcc/cp/tree.c
@@ -2676,6 +2676,52 @@ build_cp_fntype_variant (tree type, cp_ref_qualifier rqual,
   return v;
 }
 
+/* TYPE is a function or method type with a deferred exception
+   specification that has been parsed to RAISES.  Fixup all the type
+   variants that are affected in place.  Via decltype &| noexcept
+   tricks, the unparsed spec could have escaped into the type system.
+   The general case is hard to fixup canonical types for.  */
+
+void
+fixup_deferred_exception_variants (tree type, tree raises)
+{
+  tree original = TYPE_RAISES_EXCEPTIONS (type);
+  tree cr = flag_noexcept_type ? canonical_eh_spec (raises) : NULL_TREE;
+
+  gcc_checking_assert (TREE_CODE (

Re: [PATCH] c++: Don't try to parse a function declaration as deduction guide [PR97663]

2020-11-03 Thread Marek Polacek via Gcc-patches
On Mon, Nov 02, 2020 at 08:13:09PM +0100, Jakub Jelinek via Gcc-patches wrote:
> Hi!
> 
> While these function declarations have NULL decl_specifiers->type,
> they have still type specifiers specified from which the default int
> in the return type is added, so we shouldn't try to parse those as
> deduction guides.
> 
> Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

This looks OK.

> 2020-11-02  Jakub Jelinek  
> 
>   PR c++/97663
>   * parser.c (cp_parser_init_declarator): Don't try to parse
>   C++17 deduction guides if there are any type specifiers even when
>   type is NULL.
> 
>   * g++.dg/cpp1z/class-deduction75.C: New test.
> 
> --- gcc/cp/parser.c.jj2020-11-02 09:24:31.0 +0100
> +++ gcc/cp/parser.c   2020-11-02 12:16:08.650452151 +0100
> @@ -20790,6 +20790,7 @@ cp_parser_init_declarator (cp_parser* pa
>  {
>/* Handle C++17 deduction guides.  */
>if (!decl_specifiers->type
> +   && !decl_specifiers->any_type_specifiers_p
> && ctor_dtor_or_conv_p <= 0
> && cxx_dialect >= cxx17)
>   {
> --- gcc/testsuite/g++.dg/cpp1z/class-deduction75.C.jj 2020-11-02 
> 12:28:23.234403625 +0100
> +++ gcc/testsuite/g++.dg/cpp1z/class-deduction75.C2020-11-02 
> 12:28:14.384500584 +0100
> @@ -0,0 +1,15 @@
> +// PR c++/97663
> +
> +template  struct foo {};
> +template  struct bar {};
> +template  struct baz {};
> +template  struct qux {};
> +template  struct corge {};
> +
> +namespace N {
> +  unsigned foo ();
> +  signed bar ();
> +  long baz ();
> +  long long qux ();
> +  short corge (); 
> +}
> 
>   Jakub
> 

Marek



Re: [PATCH][AArch64] ACLE intrinsics: get low/high half from BFloat16 vector

2020-11-03 Thread Dennis Zhang via Gcc-patches

On 11/3/20 2:05 PM, Richard Sandiford wrote:

Dennis Zhang  writes:

Hi Richard,

On 10/30/20 2:07 PM, Richard Sandiford wrote:

Dennis Zhang  writes:

diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def 
b/gcc/config/aarch64/aarch64-simd-builtins.def
index 332a0b6b1ea..39ebb776d1d 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -719,6 +719,9 @@
 VAR1 (QUADOP_LANE, bfmlalb_lane_q, 0, ALL, v4sf)
 VAR1 (QUADOP_LANE, bfmlalt_lane_q, 0, ALL, v4sf)
   
+  /* Implemented by aarch64_vget_halfv8bf.  */

+  VAR1 (GETREG, vget_half, 0, ALL, v8bf)


This should be AUTO_FP, since it doesn't have any side-effects.
(As before, we should probably rename the flag, but that's separate work.)


+
 /* Implemented by aarch64_simd_mmlav16qi.  */
 VAR1 (TERNOP, simd_smmla, 0, NONE, v16qi)
 VAR1 (TERNOPU, simd_ummla, 0, NONE, v16qi)
diff --git a/gcc/config/aarch64/aarch64-simd.md 
b/gcc/config/aarch64/aarch64-simd.md
index 9f0e2bd1e6f..f62c52ca327 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -7159,6 +7159,19 @@
 [(set_attr "type" "neon_dot")]
   )
   
+;; vget_low/high_bf16

+(define_expand "aarch64_vget_halfv8bf"
+  [(match_operand:V4BF 0 "register_operand")
+   (match_operand:V8BF 1 "register_operand")
+   (match_operand:SI 2 "aarch64_zero_or_1")]
+  "TARGET_BF16_SIMD"
+{
+  int hbase = INTVAL (operands[2]);
+  rtx sel = aarch64_gen_stepped_int_parallel (4, hbase * 4, 1);


I think this needs to be:

aarch64_simd_vect_par_cnst_half

instead.  The issue is that on big-endian targets, GCC assumes vector
lane 0 is in the high part of the register, whereas for AArch64 it's
always in the low part of the register.  So we convert from AArch64
numbering to GCC numbering when generating the rtx and then take
endianness into account when matching the rtx later.

It would be good to have -mbig-endian tests that make sure we generate
the right instruction for each function (i.e. we get them the right way
round).  I guess it would be good to test that for little-endian too.



I've updated the expander using aarch64_simd_vect_par_cnst_half.
And the expander is divided into two for getting low and high half
separately.
It's tested for aarch64-none-linux-gnu and aarch64_be-none-linux-gnu
targets with new tests including -mbig-endian option.


+  emit_insn (gen_aarch64_get_halfv8bf (operands[0], operands[1], sel));
+  DONE;
+})
+
   ;; bfmmla
   (define_insn "aarch64_bfmmlaqv4sf"
 [(set (match_operand:V4SF 0 "register_operand" "=w")
diff --git a/gcc/config/aarch64/predicates.md b/gcc/config/aarch64/predicates.md
index 215fcec5955..0c8bc2b0c73 100644
--- a/gcc/config/aarch64/predicates.md
+++ b/gcc/config/aarch64/predicates.md
@@ -84,6 +84,10 @@
 (ior (match_test "op == constm1_rtx")
  (match_test "op == const1_rtx"))
   
+(define_predicate "aarch64_zero_or_1"

+  (and (match_code "const_int")
+   (match_test "op == const0_rtx || op == const1_rtx")))


zero_or_1 looked odd to me, feels like it should be 0_or_1 or zero_or_one.
But I see that it's for consistency with aarch64_reg_zero_or_m1_or_1,
so let's keep it as-is.



This predicate is removed since there is no need of the imm operand in
the new expanders.

Thanks for the reviews.
Is it OK for trunk now?


Looks good.  OK for trunk and branches, thanks.

Richard



Thanks for approval, Richard!
This patch is committed at 3553c658533e430b232997bdfd97faf6606fb102

Bests
Dennis


Re: [patch] Add dg-require-effective-target fpic to gcc.target/powerpc tests

2020-11-03 Thread Segher Boessenkool
Hi!

On Tue, Nov 03, 2020 at 10:12:54AM +0100, Olivier Hainque wrote:
> This change is a proposal to add 
> 
>  /* { dg-require-effective-target fpic } */
> 
> to a few tests in gcc.target/powerpc that do use
> -fpic or -fPIC but don't currently query the target
> support.
> 
> This corresponds to what many other fpic tests do
> and helps the vxWorks ports at least, as -fpic is
> typically not supported in at least one of the two
> major modes of such port (kernel vs RTP).

This is fine for trunk, thanks!

> --- a/gcc/testsuite/gcc.target/powerpc/pr84112.c
> +++ b/gcc/testsuite/gcc.target/powerpc/pr84112.c
> @@ -1,4 +1,5 @@
>  /* { dg-do compile { target powerpc*-*-* } }*/
> +/* { dg-require-effective-target fpic } */
>  /* { dg-options "-mdejagnu-cpu=power8 -O3 -fstack-protector-strong -fpic" } 
> */

You could make that

/* { dg-do compile } */

at the same time, if you want?  If that is easy for you, don't bother
otherwise.


Segher


cpplib: EOF in pragmas

2020-11-03 Thread Nathan Sidwell


This patch moves the generation of PRAGMA_EOF earlier, to when we set
need_line, rather than when we try and get the next line.  It also
prevents peeking past a PRAGMA token.   

libcpp/
* lex.c (cpp_peek_token): Do not peek past CPP_PRAGMA.
(_cpp_lex_direct): Handle EOF in pragma when setting need_line,
not when needing a line.

pushing to trunk

--
Nathan Sidwell
diff --git i/libcpp/lex.c w/libcpp/lex.c
index fb222924c8c..8283c4e4a19 100644
--- i/libcpp/lex.c
+++ w/libcpp/lex.c
@@ -2554,6 +2554,15 @@ cpp_peek_token (cpp_reader *pfile, int index)
 	  index--;
 	  break;
 	}
+  else if (peektok->type == CPP_PRAGMA)
+	{
+	  /* Don't peek past a pragma.  */
+	  if (peektok == &pfile->directive_result)
+	/* Save the pragma in the buffer.  */
+	*pfile->cur_token++ = *peektok;
+	  index--;
+	  break;
+	}
 }
   while (index--);
 
@@ -2757,14 +2766,7 @@ _cpp_lex_direct (cpp_reader *pfile)
   buffer = pfile->buffer;
   if (buffer->need_line)
 {
-  if (pfile->state.in_deferred_pragma)
-	{
-	  result->type = CPP_PRAGMA_EOL;
-	  pfile->state.in_deferred_pragma = false;
-	  if (!pfile->state.pragma_allow_expansion)
-	pfile->state.prevent_expansion--;
-	  return result;
-	}
+  gcc_assert (!pfile->state.in_deferred_pragma);
   if (!_cpp_get_fresh_line (pfile))
 	{
 	  result->type = CPP_EOF;
@@ -2829,6 +2831,19 @@ _cpp_lex_direct (cpp_reader *pfile)
 	  && !CPP_OPTION (pfile, traditional)))
 	CPP_INCREMENT_LINE (pfile, 0);
   buffer->need_line = true;
+  if (pfile->state.in_deferred_pragma)
+	{
+	  /* Produce the PRAGMA_EOL on this line.  File reading
+	 ensures there is always a \n at end of the buffer, thus
+	 in a deferred pragma we always see CPP_PRAGMA_EOL before
+	 any CPP_EOF.  */
+	  result->type = CPP_PRAGMA_EOL;
+	  result->flags &= ~PREV_WHITE;
+	  pfile->state.in_deferred_pragma = false;
+	  if (!pfile->state.pragma_allow_expansion)
+	pfile->state.prevent_expansion--;
+	  return result;
+	}
   goto fresh_line;
 
 case '0': case '1': case '2': case '3': case '4':


[patch] g++ tests: Add dg-require-effective-target fpic to a few g++ tests

2020-11-03 Thread Olivier Hainque
Hello,

This change is a proposal to add 

  /* { dg-require-effective-target fpic } */

to a few tests in g++.dg that use -fpic or -fPIC
but don't currently query the target support.

This corresponds to what most other fpic tests do
(typically, unless this is implied by some other test
such as "target os is Linux") and helps the vxWorks
ports at least, as -fpic is not supported in one of
the two major modes of such port (kernel vs RTP).

Ok to commit ?

Thanks in advance!

Best Regards,

Olivier

2020-11-02  Olivier Hainque  

testsuite/
* g++.dg/pr57878.C: Add dg-require-effective-target fpic.
* g++.dg/pr65032.C: Likewise.
* g++.dg/pr84279.C: Likewise.
* g++.dg/inherit/thunk8.C: Likewise.
* g++.dg/inherit/opt/pr64411.C: Likewise.

diff --git a/gcc/testsuite/g++.dg/inherit/thunk8.C 
b/gcc/testsuite/g++.dg/inherit/thunk8.C
index ef645356898d..ecb9cbf37fee 100644
--- a/gcc/testsuite/g++.dg/inherit/thunk8.C
+++ b/gcc/testsuite/g++.dg/inherit/thunk8.C
@@ -4,6 +4,7 @@
 
 /* { dg-do compile } */
 /* { dg-require-effective-target arm_thumb1_ok } */
+/* { dg-require-effective-target fpic } */
 /* { dg-options "-mthumb -fPIC" } */
 
 struct A {
diff --git a/gcc/testsuite/g++.dg/opt/pr64411.C 
b/gcc/testsuite/g++.dg/opt/pr64411.C
index 122b9eec414e..6ecc0a89de27 100644
--- a/gcc/testsuite/g++.dg/opt/pr64411.C
+++ b/gcc/testsuite/g++.dg/opt/pr64411.C
@@ -1,5 +1,6 @@
 // PR target/64411
 // { dg-do compile { target { { i?86-*-* x86_64-*-* } && lp64 } } }
+// { dg-require-effective-target fpic }
 // { dg-options "-Os -mcmodel=medium -fPIC -fschedule-insns 
-fselective-scheduling" }
 
 typedef __SIZE_TYPE__ size_t;
diff --git a/gcc/testsuite/g++.dg/pr57878.C b/gcc/testsuite/g++.dg/pr57878.C
index 5df2b7c9ef4e..ee9142b484bb 100644
--- a/gcc/testsuite/g++.dg/pr57878.C
+++ b/gcc/testsuite/g++.dg/pr57878.C
@@ -1,5 +1,6 @@
 /* { dg-do compile { target { { i?86-*-* x86_64-*-* } && ilp32 } } } */
 // { dg-require-effective-target c++11 }
+// { dg-require-effective-target fpic }
 /* { dg-options "-O2 -fno-omit-frame-pointer -fPIC" } */
 
 typedef int int32;
diff --git a/gcc/testsuite/g++.dg/pr65032.C b/gcc/testsuite/g++.dg/pr65032.C
index d6b6768d25a6..6e348f83a8e0 100644
--- a/gcc/testsuite/g++.dg/pr65032.C
+++ b/gcc/testsuite/g++.dg/pr65032.C
@@ -1,4 +1,5 @@
 // { dg-do compile { target i?86-*-* x86_64-*-* } }
+// { dg-require-effective-target fpic }
 // { dg-options "-Os -std=c++11 -fPIC -fstack-protector-strong 
-fomit-frame-pointer" }
 
 #pragma GCC visibility push(hidden)
diff --git a/gcc/testsuite/g++.dg/pr84279.C b/gcc/testsuite/g++.dg/pr84279.C
index a88d3fb84703..b2b5b8eabab1 100644
--- a/gcc/testsuite/g++.dg/pr84279.C
+++ b/gcc/testsuite/g++.dg/pr84279.C
@@ -1,6 +1,7 @@
 /* { dg-do compile { target { powerpc*-*-* } } } */
 /* { dg-skip-if "" { powerpc*-*-darwin* } } */
 /* { dg-require-effective-target powerpc_p8vector_ok } */
+/* { dg-require-effective-target fpic } */
 /* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { 
"-mcpu=power8" } } */
 /* { dg-options "-O3 -mcpu=power8 -g -fPIC -fvisibility=hidden 
-fstack-protector-strong" } */
 
-- 
2.17.1



Re: [patch] Add dg-require-effective-target fpic to gcc.target/powerpc tests

2020-11-03 Thread Olivier Hainque
Hi Segher,

> On 3 Nov 2020, at 18:00, Segher Boessenkool  
> wrote:
> 
>> /* { dg-require-effective-target fpic } */
>> 
>> to a few tests in gcc.target/powerpc that do use
>> -fpic or -fPIC but don't currently query the target
>> support.

> This is fine for trunk, thanks!

Great :-)

>> --- a/gcc/testsuite/gcc.target/powerpc/pr84112.c
>> +++ b/gcc/testsuite/gcc.target/powerpc/pr84112.c
>> @@ -1,4 +1,5 @@
>> /* { dg-do compile { target powerpc*-*-* } }*/
>> +/* { dg-require-effective-target fpic } */
>> /* { dg-options "-mdejagnu-cpu=power8 -O3 -fstack-protector-strong -fpic" } 
>> */
> 
> You could make that
> 
> /* { dg-do compile } */
> 
> at the same time, if you want?  If that is easy for you, don't bother
> otherwise.

That's easy enough, will do.

Thanks for your prompt feedback!

Best Regards,

Olivier



[PATCH] "used" attribute saves decl from linker garbage collection

2020-11-03 Thread Jozef Lawrynowicz
The attached patch implements TARGET_ASM_MARK_DECL_PRESERVED for ELF GNU
OSABI targets, so that declarations that have the "used" attribute
applied will be saved from linker garbage collection.

TARGET_ASM_MARK_DECL_PRESERVED will emit an assembler ".retain"
directive for the decl, and the assembler will apply the SHF_GNU_RETAIN
flag to the section containing the decl.
The linker will not garbage collect sections marked with the
SHF_GNU_RETAIN flag.

SHF_GNU_RETAIN is a GNU OSABI ELF extension, and it was discussed on the
GNU gABI mailing list here:
https://sourceware.org/pipermail/gnu-gabi/2020q3/000429.html

The Binutils patch to implement .retain and other SHF_GNU_RETAIN
handling is posted here:
https://sourceware.org/pipermail/binutils/2020-November/113993.html

Successfully bootstrapped and regtested for x86_64-pc-linux-gnu, and
regtested for arm-none-eabi.

Ok for trunk?

Thanks,
Jozef
>From 0827e28480b7edd07cda4f938bdd14b1cbdf1fa2 Mon Sep 17 00:00:00 2001
From: Jozef Lawrynowicz 
Date: Thu, 29 Oct 2020 21:00:07 +
Subject: [PATCH] Implement TARGET_MARK_DECL_PRESERVED for ELF GNU OSABI
 targets

The GAS .retain directive will apply the SHF_GNU_RETAIN flag to the
section containing the symbol that must be preserved.

gcc/ChangeLog:

* config.in: Regenerate.
* config/elfos.h (TARGET_ASM_MARK_DECL_PRESERVED): Define for
HAVE_GAS_RETAIN.
* configure: Regenerate.
* configure.ac: Define HAVE_GAS_RETAIN.
* doc/extend.texi (used attribute): Document saving from linker garbage
collection.
* doc/sourcebuild.texi: Document "retain" effective target keyword.
* doc/tm.texi: Regenerate.
* output.h (default_elf_mark_decl_preserved): New.
* target.def (mark_decl_preserved): Mention GAS .retain directive.
* varasm.c (default_elf_mark_decl_preserved): New.

gcc/testsuite/ChangeLog:

* c-c++-common/attr-used-2.c: Test for .retain in assembler output.
* c-c++-common/attr-used.c: Likewise.
* lib/target-supports.exp (check_effective_target_retain): New.
---
 gcc/config.in|  6 
 gcc/config/elfos.h   |  7 +
 gcc/configure| 35 
 gcc/configure.ac |  8 ++
 gcc/doc/extend.texi  |  6 
 gcc/doc/sourcebuild.texi |  3 ++
 gcc/doc/tm.texi  |  2 +-
 gcc/output.h |  4 +++
 gcc/target.def   |  2 +-
 gcc/testsuite/c-c++-common/attr-used-2.c |  1 +
 gcc/testsuite/c-c++-common/attr-used.c   |  2 ++
 gcc/testsuite/lib/target-supports.exp|  9 ++
 gcc/varasm.c | 13 +
 13 files changed, 96 insertions(+), 2 deletions(-)

diff --git a/gcc/config.in b/gcc/config.in
index 3657c46f349..8ef075a0ff3 100644
--- a/gcc/config.in
+++ b/gcc/config.in
@@ -1346,6 +1346,12 @@
 #endif
 
 
+/* Define if your assembler supports the .retain directive. */
+#ifndef USED_FOR_TARGET
+#undef HAVE_GAS_RETAIN
+#endif
+
+
 /* Define if your assembler supports specifying the exclude section flag. */
 #ifndef USED_FOR_TARGET
 #undef HAVE_GAS_SECTION_EXCLUDE
diff --git a/gcc/config/elfos.h b/gcc/config/elfos.h
index 74a3eafda6b..fab7b0e8ea4 100644
--- a/gcc/config/elfos.h
+++ b/gcc/config/elfos.h
@@ -474,3 +474,10 @@ see the files COPYING3 and COPYING.RUNTIME respectively.  
If not, see
 
 #undef TARGET_LIBC_HAS_FUNCTION
 #define TARGET_LIBC_HAS_FUNCTION no_c99_libc_has_function
+
+/* If the assembler supports the .retain directive for saving a symbol
+   from linker garbage collection, define this macro.  */
+#if HAVE_GAS_RETAIN
+#undef TARGET_ASM_MARK_DECL_PRESERVED
+#define TARGET_ASM_MARK_DECL_PRESERVED default_elf_mark_decl_preserved
+#endif
diff --git a/gcc/configure b/gcc/configure
index abff47d30eb..37488eac25d 100755
--- a/gcc/configure
+++ b/gcc/configure
@@ -24223,6 +24223,41 @@ cat >>confdefs.h <<_ACEOF
 _ACEOF
 
 
+# Test if the assembler supports the .retain directive for saving a symbol from
+# linker garbage collection.
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking assembler for retain 
directive" >&5
+$as_echo_n "checking assembler for retain directive... " >&6; }
+if ${gcc_cv_as_retain_r+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  gcc_cv_as_retain_r=no
+  if test x$gcc_cv_as != x; then
+$as_echo '.retain retain_sym' > conftest.s
+if { ac_try='$gcc_cv_as $gcc_cv_as_flags  -o conftest.o conftest.s >&5'
+  { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
+  (eval $ac_try) 2>&5
+  ac_status=$?
+  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; }; }
+then
+   gcc_cv_as_retain_r=yes
+else
+  echo "configure: failed program was" >&5
+  cat conftest.s >&5
+fi
+rm -f conftest.o conftest.s
+  fi
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: resu

Re: [PATCH] "used" attribute saves decl from linker garbage collection

2020-11-03 Thread H.J. Lu via Gcc-patches
On Tue, Nov 3, 2020 at 9:41 AM Jozef Lawrynowicz
 wrote:
>
> The attached patch implements TARGET_ASM_MARK_DECL_PRESERVED for ELF GNU
> OSABI targets, so that declarations that have the "used" attribute
> applied will be saved from linker garbage collection.
>
> TARGET_ASM_MARK_DECL_PRESERVED will emit an assembler ".retain"

Can you use the "R" flag instead?

> directive for the decl, and the assembler will apply the SHF_GNU_RETAIN
> flag to the section containing the decl.
> The linker will not garbage collect sections marked with the
> SHF_GNU_RETAIN flag.
>
> SHF_GNU_RETAIN is a GNU OSABI ELF extension, and it was discussed on the
> GNU gABI mailing list here:
> https://sourceware.org/pipermail/gnu-gabi/2020q3/000429.html
>
> The Binutils patch to implement .retain and other SHF_GNU_RETAIN
> handling is posted here:
> https://sourceware.org/pipermail/binutils/2020-November/113993.html
>
> Successfully bootstrapped and regtested for x86_64-pc-linux-gnu, and
> regtested for arm-none-eabi.
>
> Ok for trunk?
>
> Thanks,
> Jozef



-- 
H.J.


Re: [PATCH, 1/3, OpenMP] Target mapping changes for OpenMP 5.0, front-end parts

2020-11-03 Thread Chung-Lin Tang

Hi Jakub,
here is v3 of this patch set.

On 2020/10/29 7:44 PM, Jakub Jelinek wrote:

+extern void c_omp_adjust_clauses (tree, bool);

So, can you please rename the function to either
c_omp_adjust_target_clauses or c_omp_adjust_mapping_clauses or
c_omp_adjust_map_clauses?


I've renamed it to 'c_omp_adjust_map_clauses'.


--- a/gcc/c-family/c-omp.c
+++ b/gcc/c-family/c-omp.c
@@ -2579,3 +2579,50 @@ c_omp_map_clause_name (tree clause, bool oacc)
  }
return omp_clause_code_name[OMP_CLAUSE_CODE (clause)];
  }
+
+/* Adjust map clauses after normal clause parsing, mainly to turn specific
+   base-pointer map cases into attach/detach and mark them addressable.  */
+void
+c_omp_adjust_clauses (tree clauses, bool is_target)
+{
+  for (tree c = clauses; c; c = OMP_CLAUSE_CHAIN (c))
+if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_MAP
+   && OMP_CLAUSE_MAP_KIND (c) == GOMP_MAP_FIRSTPRIVATE_POINTER

If this is only meant to handle decls, perhaps there should be
&& DECL_P (OMP_CLAUSE_DECL (c))
?


+   && TREE_CODE (TREE_TYPE (OMP_CLAUSE_DECL (c))) != ARRAY_TYPE)
+  {
+   tree ptr = OMP_CLAUSE_DECL (c);
+   bool ptr_mapped = false;
+   if (is_target)
+ {
+   for (tree m = clauses; m; m = OMP_CLAUSE_CHAIN (m))
+ if (OMP_CLAUSE_CODE (m) == OMP_CLAUSE_MAP
+ && OMP_CLAUSE_DECL (m) == ptr
+ && (OMP_CLAUSE_MAP_KIND (m) == GOMP_MAP_ALLOC
+ || OMP_CLAUSE_MAP_KIND (m) == GOMP_MAP_TO
+ || OMP_CLAUSE_MAP_KIND (m) == GOMP_MAP_FROM
+ || OMP_CLAUSE_MAP_KIND (m) == GOMP_MAP_TOFROM
+ || OMP_CLAUSE_MAP_KIND (m) == GOMP_MAP_ALWAYS_TO
+ || OMP_CLAUSE_MAP_KIND (m) == GOMP_MAP_ALWAYS_FROM
+ || OMP_CLAUSE_MAP_KIND (m) == GOMP_MAP_ALWAYS_TOFROM))
+   {
+ ptr_mapped = true;
+ break;
+   }

What you could e.g. do is have this loop at the start of function, with
&& DECL_P (OMP_CLAUSE_DECL (m))
instead of the == ptr check, and perhaps && POINTER_TYPE_P (TREE_TYPE
(OMP_CLAUSE_DECL (m))) check and set a bit in a bitmap for each such decl,
then in the GOMP_MAP_FIRSTPRIVATE_POINTER loop just check the bitmap.
Or, keep it in the loop like it is above, but populate the bitmap
lazily (upon seeing the first GOMP_MAP_FIRSTPRIVATE_POINTER) and for further
ones just use it.


I re-wrote c_omp_adjust_map_clauses to address the complexity issues you 
mentioned,
now it should be limited by a linear pass to collect and merge the firstprivate 
base
pointer + existence of a mapping of it, using a hash_map.

Patch set has been re-tested with no regressions for gcc, g++, gfortran, and 
libgomp.

Thanks,
Chung-Lin

gcc/c-family/
* c-common.h (c_omp_adjust_map_clauses): New declaration.
* c-omp.c (c_omp_adjust_map_clauses): New function.

gcc/c/
* c-parser.c (c_parser_omp_target_data): Add use of
new c_omp_adjust_map_clauses function. Add GOMP_MAP_ATTACH_DETACH as
handled map clause kind.
(c_parser_omp_target_enter_data): Likewise.
(c_parser_omp_target_exit_data): Likewise.
(c_parser_omp_target): Likewise.
* c-typeck.c (handle_omp_array_sections): Adjust COMPONENT_REF case to
use GOMP_MAP_ATTACH_DETACH map kind for C_ORT_OMP region type.
(c_finish_omp_clauses): Adjust bitmap checks to allow struct decl and
same struct field access to co-exist on OpenMP construct.

gcc/cp/
* parser.c (cp_parser_omp_target_data): Add use of
new c_omp_adjust_map_clauses function. Add GOMP_MAP_ATTACH_DETACH as
handled map clause kind.
(cp_parser_omp_target_enter_data): Likewise.
(cp_parser_omp_target_exit_data): Likewise.
(cp_parser_omp_target): Likewise.
* semantics.c (handle_omp_array_sections): Adjust COMPONENT_REF case to
use GOMP_MAP_ATTACH_DETACH map kind for C_ORT_OMP region type. Fix
interaction between reference case and attach/detach.
(finish_omp_clauses): Adjust bitmap checks to allow struct decl and
same struct field access to co-exist on OpenMP construct.
diff --git a/gcc/c-family/c-common.h b/gcc/c-family/c-common.h
index bb38e6c76a4..3eb909a2946 100644
--- a/gcc/c-family/c-common.h
+++ b/gcc/c-family/c-common.h
@@ -1221,6 +1221,7 @@ extern enum omp_clause_defaultmap_kind 
c_omp_predetermined_mapping (tree);
 extern tree c_omp_check_context_selector (location_t, tree);
 extern void c_omp_mark_declare_variant (location_t, tree, tree);
 extern const char *c_omp_map_clause_name (tree, bool);
+extern void c_omp_adjust_map_clauses (tree, bool);
 
 /* Return next tree in the chain for chain_next walking of tree nodes.  */
 static inline tree
diff --git a/gcc/c-family/c-omp.c b/gcc/c-family/c-omp.c
index d7cff0f4cca..275c6afabe1 100644
--- a/gcc/c-family/c-omp.c
+++ b/gcc/c-family/c-omp.c
@@ -2579,3 +2579,92 @@ c_omp_m

Re: [PATCH, 2/3, OpenMP] Target mapping changes for OpenMP 5.0, middle-end parts and compiler testcases

2020-11-03 Thread Chung-Lin Tang

On 2020/10/29 7:49 PM, Jakub Jelinek wrote:

On Wed, Oct 28, 2020 at 06:32:21PM +0800, Chung-Lin Tang wrote:

@@ -8958,25 +9083,20 @@ gimplify_scan_omp_clauses (tree *list_p, gimple_seq 
*pre_p,
  /* An "attach/detach" operation on an update directive should
 behave as a GOMP_MAP_ALWAYS_POINTER.  Beware that
 unlike attach or detach map kinds, GOMP_MAP_ALWAYS_POINTER
 depends on the previous mapping.  */
  if (code == OACC_UPDATE
  && OMP_CLAUSE_MAP_KIND (c) == GOMP_MAP_ATTACH_DETACH)
OMP_CLAUSE_SET_MAP_KIND (c, GOMP_MAP_ALWAYS_POINTER);
- if (gimplify_expr (pd, pre_p, NULL, is_gimple_lvalue, fb_lvalue)
- == GS_ERROR)
-   {
- remove = true;
- break;
-   }

So what gimplifies those now?


They're gimplified somewhere during omp-low now.
(some gimplify scan testcases were adjusted to accommodate this change)

I don't remember the exact case I encountered, but there were some issues with 
gimplified
expressions inside the map clauses making some later checking more difficult. I 
haven't seen
any negative effect of this modification so far.


I don't like that, it goes against many principles, gimplification really
shouldn't leave around non-GIMPLE IL.
If you need to compare same expression or same expression bases later,
perhaps detect the equalities during gimplification before actually gimplifying 
the
clauses and ensure they are gimplified to the same expression or are using
same base (e.g. by adding SAVE_EXPRs or TARGET_EXPRs before the
gimplification).


I have moved that same gimplify_expr call down to below the processing block,
and things still work as expected. My aforementioned gimple-scan-test 
modifications
have all been reverted, and all original tests still pass correctly.

Thanks,
Chung-Lin

gcc/
* gimplify.c (is_or_contains_p): New static helper function.
(omp_target_reorder_clauses): New function.
(gimplify_scan_omp_clauses): Add use of omp_target_reorder_clauses to
reorder clause list according to OpenMP 5.0 rules. Add handling of
GOMP_MAP_ATTACH_DETACH for OpenMP cases.
* omp-low.c (is_omp_target): New static helper function.
(scan_sharing_clauses): Add scan phase handling of 
GOMP_MAP_ATTACH/DETACH
for OpenMP cases.
(lower_omp_target): Add lowering handling of GOMP_MAP_ATTACH/DETACH for
OpenMP cases.

gcc/testsuite/
* c-c++-common/gomp/clauses-2.c: Remove dg-error cases now valid.
* gfortran.dg/gomp/map-2.f90: Likewise.
* c-c++-common/gomp/map-5.c: New testcase.



diff --git a/gcc/gimplify.c b/gcc/gimplify.c
index 29f385c9368..c2500656193 100644
--- a/gcc/gimplify.c
+++ b/gcc/gimplify.c
@@ -8364,6 +8364,113 @@ extract_base_bit_offset (tree base, tree *base_ref, 
poly_int64 *bitposp,
   return base;
 }
 
+/* Returns true if EXPR is or contains (as a sub-component) BASE_PTR.  */
+
+static bool
+is_or_contains_p (tree expr, tree base_ptr)
+{
+  while (expr != base_ptr)
+if (TREE_CODE (base_ptr) == COMPONENT_REF)
+  base_ptr = TREE_OPERAND (base_ptr, 0);
+else
+  break;
+  return expr == base_ptr;
+}
+
+/* Implement OpenMP 5.x map ordering rules for target directives. There are
+   several rules, and with some level of ambiguity, hopefully we can at least
+   collect the complexity here in one place.  */
+
+static void
+omp_target_reorder_clauses (tree *list_p)
+{
+  /* Collect refs to alloc/release/delete maps.  */
+  auto_vec ard;
+  tree *cp = list_p;
+  while (*cp != NULL_TREE)
+if (OMP_CLAUSE_CODE (*cp) == OMP_CLAUSE_MAP
+   && (OMP_CLAUSE_MAP_KIND (*cp) == GOMP_MAP_ALLOC
+   || OMP_CLAUSE_MAP_KIND (*cp) == GOMP_MAP_RELEASE
+   || OMP_CLAUSE_MAP_KIND (*cp) == GOMP_MAP_DELETE))
+  {
+   /* Unlink cp and push to ard.  */
+   tree c = *cp;
+   tree nc = OMP_CLAUSE_CHAIN (c);
+   *cp = nc;
+   ard.safe_push (c);
+
+   /* Any associated pointer type maps should also move along.  */
+   while (*cp != NULL_TREE
+  && OMP_CLAUSE_CODE (*cp) == OMP_CLAUSE_MAP
+  && (OMP_CLAUSE_MAP_KIND (*cp) == GOMP_MAP_FIRSTPRIVATE_REFERENCE
+  || OMP_CLAUSE_MAP_KIND (*cp) == GOMP_MAP_FIRSTPRIVATE_POINTER
+  || OMP_CLAUSE_MAP_KIND (*cp) == GOMP_MAP_ATTACH_DETACH
+  || OMP_CLAUSE_MAP_KIND (*cp) == GOMP_MAP_POINTER
+  || OMP_CLAUSE_MAP_KIND (*cp) == GOMP_MAP_ALWAYS_POINTER
+  || OMP_CLAUSE_MAP_KIND (*cp) == GOMP_MAP_TO_PSET))
+ {
+   c = *cp;
+   nc = OMP_CLAUSE_CHAIN (c);
+   *cp = nc;
+   ard.safe_push (c);
+ }
+  }
+else
+  cp = &OMP_CLAUSE_CHAIN (*cp);
+
+  /* Link alloc/release/delete maps to the end of list.  */
+  for (unsigned int i = 0; i < ard.length (); i++)
+{
+   

Re: [PATCH, 3/3, OpenMP] Target mapping changes for OpenMP 5.0, libgomp parts [resend]

2020-11-03 Thread Chung-Lin Tang

On 2020/10/28 6:33 PM, Chung-Lin Tang wrote:

On 2020/9/1 9:37 PM, Chung-Lin Tang wrote:

his patch is the changes to libgomp and testcases.

There is now (again) a need to indicate OpenACC/OpenMP and
an 'enter data' style directive, so the associated changes to
'enum gomp_map_vars_kind'.

There is a slight change in the logic of gomp_attach_pointer
handling, because for OpenMP there might be a non-offloaded
data clause that attempts an attachment but silently continues
in case the pointer is not mapped.

Also in the testcases, an XFAILed testcase for structure element
mapping is added. OpenMP 5.0 specifies that elements of the same
structure variable are allocated/deallocated in a uniform fashion,
but this hasn't been implemented yet in this patch.


Hi Jakub,
you haven't reviewed this 3rd part yet, but still updating with a rebased patch 
here.

I've removed the above mentioned XFAILed testcase from the patch, since it 
actually
belongs in the structure element mapping patches instead of here.

Thanks,
Chung-Lin

 libgomp/
 * libgomp.h (enum gomp_map_vars_kind): Adjust enum values to be bit-flag
 usable.
 * oacc-mem.c (acc_map_data): Adjust gomp_map_vars argument flags to
 'GOMP_MAP_VARS_OPENACC | GOMP_MAP_VARS_ENTER_DATA'.
 (goacc_enter_datum): Likewise for call to gomp_map_vars_async.
 (goacc_enter_data_internal): Likewise.

 * target.c (gomp_map_vars_internal): Change checks of 
GOMP_MAP_VARS_ENTER_DATA
 to use bit-and (&). Adjust use of gomp_attach_pointer for OpenMP cases.
 (gomp_exit_data): Add handling of GOMP_MAP_DETACH.
 (GOMP_target_enter_exit_data): Add handling of GOMP_MAP_ATTACH.
 * testsuite/libgomp.c-c++-common/ptr-attach-1.c: New testcase.


For the libgomp patch, v3 doesn't update any of the code proper, but the
libgomp.c-c++-common/ptr-attach-1.c testcase had some code added to test the
case of a base-pointer on device by "declare target".

Thanks,
Chung-Lin


diff --git a/libgomp/libgomp.h b/libgomp/libgomp.h
index da7ac037dcd..0cc3f4d406b 100644
--- a/libgomp/libgomp.h
+++ b/libgomp/libgomp.h
@@ -1162,10 +1162,10 @@ struct gomp_device_descr
 /* Kind of the pragma, for which gomp_map_vars () is called.  */
 enum gomp_map_vars_kind
 {
-  GOMP_MAP_VARS_OPENACC,
-  GOMP_MAP_VARS_TARGET,
-  GOMP_MAP_VARS_DATA,
-  GOMP_MAP_VARS_ENTER_DATA
+  GOMP_MAP_VARS_OPENACC= 1,
+  GOMP_MAP_VARS_TARGET = 2,
+  GOMP_MAP_VARS_DATA   = 4,
+  GOMP_MAP_VARS_ENTER_DATA = 8
 };
 
 extern void gomp_acc_declare_allocate (bool, size_t, void **, size_t *,
diff --git a/libgomp/oacc-mem.c b/libgomp/oacc-mem.c
index 65757ab2ffc..8dc521ac6d6 100644
--- a/libgomp/oacc-mem.c
+++ b/libgomp/oacc-mem.c
@@ -403,7 +403,8 @@ acc_map_data (void *h, void *d, size_t s)
 
   struct target_mem_desc *tgt
= gomp_map_vars (acc_dev, mapnum, &hostaddrs, &devaddrs, &sizes,
-&kinds, true, GOMP_MAP_VARS_ENTER_DATA);
+&kinds, true,
+GOMP_MAP_VARS_OPENACC | GOMP_MAP_VARS_ENTER_DATA);
   assert (tgt);
   assert (tgt->list_count == 1);
   splay_tree_key n = tgt->list[0].key;
@@ -572,7 +573,8 @@ goacc_enter_datum (void **hostaddrs, size_t *sizes, void 
*kinds, int async)
 
   struct target_mem_desc *tgt
= gomp_map_vars_async (acc_dev, aq, mapnum, hostaddrs, NULL, sizes,
-  kinds, true, GOMP_MAP_VARS_ENTER_DATA);
+  kinds, true,
+  GOMP_MAP_VARS_OPENACC | 
GOMP_MAP_VARS_ENTER_DATA);
   assert (tgt);
   assert (tgt->list_count == 1);
   n = tgt->list[0].key;
@@ -1202,7 +1204,7 @@ goacc_enter_data_internal (struct gomp_device_descr 
*acc_dev, size_t mapnum,
  struct target_mem_desc *tgt
= gomp_map_vars_async (acc_dev, aq, groupnum, &hostaddrs[i], NULL,
   &sizes[i], &kinds[i], true,
-  GOMP_MAP_VARS_ENTER_DATA);
+  GOMP_MAP_VARS_OPENACC | 
GOMP_MAP_VARS_ENTER_DATA);
  assert (tgt);
 
  gomp_mutex_lock (&acc_dev->lock);
diff --git a/libgomp/target.c b/libgomp/target.c
index 1a8c67c2df5..61dab064fae 100644
--- a/libgomp/target.c
+++ b/libgomp/target.c
@@ -683,7 +683,7 @@ gomp_map_vars_internal (struct gomp_device_descr *devicep,
   struct target_mem_desc *tgt
 = gomp_malloc (sizeof (*tgt) + sizeof (tgt->list[0]) * mapnum);
   tgt->list_count = mapnum;
-  tgt->refcount = pragma_kind == GOMP_MAP_VARS_ENTER_DATA ? 0 : 1;
+  tgt->refcount = (pragma_kind & GOMP_MAP_VARS_ENTER_DATA) ? 0 : 1;
   tgt->device_descr = devicep;
   tgt->prev = NULL;
   struct gomp_coalesce_buf cbuf, *cbufp = NULL;
@@ -1212,15 +1212,16 @@ gomp_map_vars_internal (struct gomp_device_descr 
*devicep,
  /* OpenACC 'attach'/'detach' doesn't affect
 structured/dynamic reference counts ('n->refcount',
 'n->dynamic_refcount

  1   2   >