Re: [PATCH][i386,AVX] Enable VNNI support [3/5]

2017-12-08 Thread Kirill Yukhin
On 24 Oct 10:57, Koval, Julia wrote:
> Hi,
> This patch enables VPDPBUSDS instruction. The doc for isaset and instruction: 
> https://software.intel.com/sites/default/files/managed/c5/15/architecture-instruction-set-extensions-programming-reference.pdf
> 
> Ok for trunk?
Patch is OK. I've checked it in.

--
Thanks, K
> 
> Thanks,
> Julia
> 
> gcc/
>   * config/i386/avx512vnniintrin.h (_mm512_dpbusds_epi32,
>   _mm512_mask_dpbusds_epi32, _mm512_maskz_dpbusds_epi32): New.
>   * config/i386/avx512vnnivlintrin.h (_mm256_dpbusds_epi32,
>   _mm256_mask_dpbusds_epi32, _mm256_maskz_dpbusds_epi32,
>   _mm_dpbusds_epi32, _mm_mask_dpbusds_epi32,
>   _mm_maskz_dpbusds_epi32): New intrinsics.
> 
> gcc/testsuite/
>   * gcc.target/i386/avx512f-vnni-1.c: Add vpdpbusds check.
>   * gcc.target/i386/avx512vl-vnni-1.c: Ditto.
>   * gcc.target/i386/avx512f-vpdpbusds-2.c: New.
>   * gcc.target/i386/avx512vl-vpdpbusds-2.c: Ditto.




Re: [PATCH][i386,AVX] Enable VNNI support [4/5]

2017-12-08 Thread Kirill Yukhin
Hello Julia,
On 24 Oct 11:06, Koval, Julia wrote:
> Hi,
> This patch enables VPDPWSSD instruction. The doc for isaset and instruction: 
> https://software.intel.com/sites/default/files/managed/c5/15/architecture-instruction-set-extensions-programming-reference.pdf
> 
> Ok for trunk?
Your patch is OK for trunk. I've checked it in.

--
Thanks, K


Re: [PATCH] Fix vectorizer part of PR81303

2017-12-08 Thread Richard Biener
On Thu, 7 Dec 2017, Bin.Cheng wrote:

> On Wed, Dec 6, 2017 at 1:29 PM, Richard Biener  wrote:
> >
> > The following fixes a vectorization issue that appears when trying
> > to vectorize the bwaves mat_times_vec kernel after interchange was
> > performed by the interchange pass.  That interchange inserts the
> > following code for the former reduction created by LIM store-motion:
> I do observe more cases are vectorized by this patch on AArch64.
> Still want to find a way not generating the cond_expr, but for the moment
> I will have another patch make interchange even more conservative for
> small cases.  In which the new cmp/select instructions do cost a lot against
> the small loop body.

Yeah.  I thought about what it takes to avoid the conditional - basically
we'd need to turn the init value to a (non-nested) loop that we'd need
to insert on the preheader of the outer loop.

Richard.  

> Thanks,
> bin
> >
> >[local count: 161061274]:
> >   # m_58 = PHI <1(10), m_84(20)>
> > ...
> >[local count: 912680551]:
> >   # l_35 = PHI <1(13), l_57(21)>
> > ...
> >   y__I_lsm.113_140 = *y_139(D)[_31];
> >   y__I_lsm.113_94 = m_58 != 1 ? y__I_lsm.113_140 : 0.0;
> > ...
> >   *y_139(D)[_31] = _101;
> >
> >
> > so we have a COND_EXPR with a test on an integer IV m_58 with
> > double values.  Note that the m_58 != 1 condition is invariant
> > in the l loop.
> >
> > Currently we vectorize this condition using V8SImode vectors
> > causing a vectorization factor of 8 and thus forcing the scalar
> > path for the bwaves case (the loops have an iteration count of 5).
> >
> > The following patch makes the vectorizer handle invariant conditions
> > in the first place and second handle widening of operands of invariant
> > conditions transparently (the promotion will happen on the invariant
> > scalars).  This makes it possible to use a vectorization factor of 4,
> > reducing the bwaves runtime from 208s before interchange
> > (via 190s after interchange) to 172s after interchange and vectorization
> > with AVX256 (on a Haswell machine).
> >
> > For the vectorizable_condition part to work I need to avoid
> > pulling apart the condition from the COND_EXPR during pattern
> > detection.
> >
> > Bootstrapped on x86_64-unknown-linux-gnu, testing in progress.
> >
> > Richard.
> >
> > 2017-12-06  Richard Biener  
> >
> > PR tree-optimization/81303
> > * tree-vect-stmts.c (vect_is_simple_cond): For invariant
> > conditions try to create a comparison vector type matching
> > the data vector type.
> > (vectorizable_condition): Adjust.
> > * tree-vect-patterns.c (vect_recog_mask_conversion_pattern):
> > Leave invariant conditions alone in case we can vectorize those.
> >
> > * gcc.target/i386/vectorize9.c: New testcase.
> > * gcc.target/i386/vectorize10.c: New testcase.
> >
> > Index: gcc/tree-vect-stmts.c
> > ===
> > --- gcc/tree-vect-stmts.c   (revision 255438)
> > +++ gcc/tree-vect-stmts.c   (working copy)
> > @@ -7792,7 +7792,8 @@ vectorizable_load (gimple *stmt, gimple_
> >
> >  static bool
> >  vect_is_simple_cond (tree cond, vec_info *vinfo,
> > -tree *comp_vectype, enum vect_def_type *dts)
> > +tree *comp_vectype, enum vect_def_type *dts,
> > +tree vectype)
> >  {
> >tree lhs, rhs;
> >tree vectype1 = NULL_TREE, vectype2 = NULL_TREE;
> > @@ -7845,6 +7846,20 @@ vect_is_simple_cond (tree cond, vec_info
> >  return false;
> >
> >*comp_vectype = vectype1 ? vectype1 : vectype2;
> > +  /* Invariant comparison.  */
> > +  if (! *comp_vectype)
> > +{
> > +  tree scalar_type = TREE_TYPE (lhs);
> > +  /* If we can widen the comparison to match vectype do so.  */
> > +  if (INTEGRAL_TYPE_P (scalar_type)
> > + && tree_int_cst_lt (TYPE_SIZE (scalar_type),
> > + TYPE_SIZE (TREE_TYPE (vectype
> > +   scalar_type = build_nonstandard_integer_type
> > + (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (vectype))),
> > +  TYPE_UNSIGNED (scalar_type));
> > +  *comp_vectype = get_vectype_for_scalar_type (scalar_type);
> > +}
> > +
> >return true;
> >  }
> >
> > @@ -7942,7 +7957,7 @@ vectorizable_condition (gimple *stmt, gi
> >else_clause = gimple_assign_rhs3 (stmt);
> >
> >if (!vect_is_simple_cond (cond_expr, stmt_info->vinfo,
> > -   &comp_vectype, &dts[0])
> > +   &comp_vectype, &dts[0], vectype)
> >|| !comp_vectype)
> >  return false;
> >
> > Index: gcc/tree-vect-patterns.c
> > ===
> > --- gcc/tree-vect-patterns.c(revision 255438)
> > +++ gcc/tree-vect-patterns.c(working copy)
> > @@ -3976,6 +3976,32 @@ vect_recog_mask_conversion_pattern (vec<
> >   || TYPE_VECTOR_SUBPARTS (vectype1) == TYPE_VECTOR_SUBP

Re: [PATCH][i386,AVX] Enable VNNI support [5/5]

2017-12-08 Thread Kirill Yukhin
Hello Julia,
On 24 Oct 11:20, Koval, Julia wrote:
> Hi,
> This patch enables VPDPWSSDS instruction. The doc for isaset and instruction: 
> https://software.intel.com/sites/default/files/managed/c5/15/architecture-instruction-set-extensions-programming-reference.pdf
> 
> Ok for trunk?
Your patch is OK for trunk. I've checked it in.

--
Thanks, K
> 
> Thanks,
> Julia


Re: [PATCH][GCC][ARM] Generate .arch and .arch_extensions for each function if required. [Patch (3/3)]

2017-12-08 Thread Christophe Lyon
Hi Tamar,


On 1 December 2017 at 09:57, Tamar Christina  wrote:
> Ping,
>
> This patch has also been bootstrapped and no issues.
>
>> -Original Message-
>> From: Tamar Christina
>> Sent: Tuesday, November 21, 2017 17:29
>> To: Tamar Christina ; gcc-patches@gcc.gnu.org
>> Cc: nd ; Ramana Radhakrishnan
>> ; Richard Earnshaw
>> ; ni...@redhat.com; Kyrylo Tkachov
>> 
>> Subject: RE: [PATCH][GCC][ARM] Generate .arch and .arch_extensions for
>> each function if required. [Patch (3/3)]
>>
>> Ping
>>
>> > -Original Message-
>> > From: gcc-patches-ow...@gcc.gnu.org [mailto:gcc-patches-
>> > ow...@gcc.gnu.org] On Behalf Of Tamar Christina
>> > Sent: Monday, November 6, 2017 16:52
>> > To: gcc-patches@gcc.gnu.org
>> > Cc: nd ; Ramana Radhakrishnan
>> > ; Richard Earnshaw
>> > ; ni...@redhat.com; Kyrylo Tkachov
>> > 
>> > Subject: [PATCH][GCC][ARM] Generate .arch and .arch_extensions for
>> > each function if required. [Patch (3/3)]
>> >
>> > Hi All,
>> >
>> > This patch adds the needed machinery to generate the appropriate .arch
>> > and .arch_extension directives per function.
>> >
>> > Borrowing from AArch64 this is only done when it's required (i.e. when
>> > the directives to be set differ from the currently set one).
>> >
>> > As part if this the .fpu directive has also been cleaned up to follow
>> > the same logic.
>> >
>> > Regtested on arm-none-eabi and no regressions.
>> >
>> > Ok for trunk?
>> >
>> > gcc/
>> > 2017-11-06  Tamar Christina  
>> >
>> > PR target/82641
>> > * config/arm/arm.c (INCLUDE_STRING): Define.
>> > (arm_last_printed_arch_string, arm_last_printed_fpu_string): New.
>> > (arm_declare_function_name): Conservatively emit .arch,
>> > .arch_extensions
>> > and .fpu.
>> >
>> > gcc/testsuite/
>> > 2017-11-06  Tamar Christina  
>> >
>> > PR target/82641
>> > * gcc.target/arm/pragma_arch_attribute_2.c: New.
>> > * gcc.target/arm/pragma_arch_attribute_2.c: New.
>> > * gcc.target/arm/pragma_arch_attribute_3.c: New.
>> > * gcc.target/arm/pragma_fpu_attribute.c: New.
>> > * gcc.target/arm/pragma_fpu_attribute_2.c: New.
>> >
>> > --

I'm afraid you'll have to update the testcases: they fail on non-hf targets
(arm-none-linux-gnueabi, arm-none-eabi), because:
In file included from /gcc/testsuite/gcc.target/arm/pragma_fpu_attribute.c:7:
/aci-gcc-fsf/builds/gcc-fsf-gccsrc/obj-arm-none-linux-gnueabi/gcc3/gcc/include/arm_neon.h:31:2:
error: #error "NEON intrinsics not available with the soft-float ABI.
Please use -mfloat-abi=softfp or -mfloat-abi=hard"
/gcc/testsuite/gcc.target/arm/pragma_fpu_attribute.c:11:53: error:
unknown type name 'poly64x1_t'
/gcc/testsuite/gcc.target/arm/pragma_fpu_attribute.c:11:71: error:
unknown type name 'poly64x1_t'

Looking at other attributes tests, maybe you need to add arm_neon_ok?

Thanks,
Christophe


[PATCH] Testcases for PR81303

2017-12-08 Thread Richard Biener

The following adds two testcases for PR81303, one verifying we interchange
the loop and a second that verifies we can vectorize the result.

Tested on x86_64-unknown-linux-gnu, applied.

Richard.

2017-12-08  Richard Biener  

PR tree-optimization/81303
* gfortran.dg/pr81303.f: New testcase.
* gfortran.dg/vect/pr81303.f: Likewise.

Index: gcc/testsuite/gfortran.dg/pr81303.f
===
--- gcc/testsuite/gfortran.dg/pr81303.f (nonexistent)
+++ gcc/testsuite/gfortran.dg/pr81303.f (working copy)
@@ -0,0 +1,44 @@
+! { dg-do compile }
+! { dg-options "-O3 -ffast-math -floop-interchange 
-fdump-tree-linterchange-details" }
+
+subroutine mat_times_vec(y,x,a,axp,ayp,azp,axm,aym,azm,
+ $  nb,nx,ny,nz)
+implicit none
+integer nb,nx,ny,nz,i,j,k,m,l,kit,im1,ip1,jm1,jp1,km1,kp1
+
+real*8 y(nb,nx,ny,nz),x(nb,nx,ny,nz)
+
+real*8 a(nb,nb,nx,ny,nz),
+ 1  axp(nb,nb,nx,ny,nz),ayp(nb,nb,nx,ny,nz),azp(nb,nb,nx,ny,nz),
+ 2  axm(nb,nb,nx,ny,nz),aym(nb,nb,nx,ny,nz),azm(nb,nb,nx,ny,nz)
+
+
+  do k=1,nz
+ km1=mod(k+nz-2,nz)+1
+ kp1=mod(k,nz)+1
+ do j=1,ny
+jm1=mod(j+ny-2,ny)+1
+jp1=mod(j,ny)+1
+do i=1,nx
+   im1=mod(i+nx-2,nx)+1
+   ip1=mod(i,nx)+1
+   do l=1,nb
+  y(l,i,j,k)=0.0d0
+  do m=1,nb
+ y(l,i,j,k)=y(l,i,j,k)+
+ 1   a(l,m,i,j,k)*x(m,i,j,k)+
+ 2   axp(l,m,i,j,k)*x(m,ip1,j,k)+
+ 3   ayp(l,m,i,j,k)*x(m,i,jp1,k)+
+ 4   azp(l,m,i,j,k)*x(m,i,j,kp1)+
+ 5   axm(l,m,i,j,k)*x(m,im1,j,k)+
+ 6   aym(l,m,i,j,k)*x(m,i,jm1,k)+
+ 7   azm(l,m,i,j,k)*x(m,i,j,km1)
+  enddo
+   enddo
+enddo
+ enddo
+enddo  
+return
+end
+
+! { dg-final { scan-tree-dump-times "is interchanged" 1 "linterchange" } }
Index: gcc/testsuite/gfortran.dg/vect/pr81303.f
===
--- gcc/testsuite/gfortran.dg/vect/pr81303.f(nonexistent)
+++ gcc/testsuite/gfortran.dg/vect/pr81303.f(working copy)
@@ -0,0 +1,50 @@
+! { dg-do compile }
+! { dg-require-effective-target vect_cond_mixed }
+! { dg-require-effective-target vect_double }
+! { dg-additional-options "-O3 -ffast-math -floop-interchange 
-fdump-tree-linterchange-details" }
+! vect_cond_mixed lies on x86, we cannot do vcond[_eq]v2div2df
+! { dg-additional-options "-msse4.1" { target { x86_64-*-* i?86-*-* } } }
+
+subroutine mat_times_vec(y,x,a,axp,ayp,azp,axm,aym,azm,
+ $  nb,nx,ny,nz)
+implicit none
+integer nb,nx,ny,nz,i,j,k,m,l,kit,im1,ip1,jm1,jp1,km1,kp1
+
+real*8 y(nb,nx,ny,nz),x(nb,nx,ny,nz)
+
+real*8 a(nb,nb,nx,ny,nz),
+ 1  axp(nb,nb,nx,ny,nz),ayp(nb,nb,nx,ny,nz),azp(nb,nb,nx,ny,nz),
+ 2  axm(nb,nb,nx,ny,nz),aym(nb,nb,nx,ny,nz),azm(nb,nb,nx,ny,nz)
+
+
+  do k=1,nz
+ km1=mod(k+nz-2,nz)+1
+ kp1=mod(k,nz)+1
+ do j=1,ny
+jm1=mod(j+ny-2,ny)+1
+jp1=mod(j,ny)+1
+do i=1,nx
+   im1=mod(i+nx-2,nx)+1
+   ip1=mod(i,nx)+1
+   do l=1,nb
+  y(l,i,j,k)=0.0d0
+  do m=1,nb
+ y(l,i,j,k)=y(l,i,j,k)+
+ 1   a(l,m,i,j,k)*x(m,i,j,k)+
+ 2   axp(l,m,i,j,k)*x(m,ip1,j,k)+
+ 3   ayp(l,m,i,j,k)*x(m,i,jp1,k)+
+ 4   azp(l,m,i,j,k)*x(m,i,j,kp1)+
+ 5   axm(l,m,i,j,k)*x(m,im1,j,k)+
+ 6   aym(l,m,i,j,k)*x(m,i,jm1,k)+
+ 7   azm(l,m,i,j,k)*x(m,i,j,km1)
+  enddo
+   enddo
+enddo
+ enddo
+enddo  
+return
+end
+
+! verify we can vectorize the inner loop after interchange
+! { dg-final { scan-tree-dump-times "is interchanged" 1 "linterchange" } }
+! { dg-final { scan-tree-dump "vectorized 1 loops in function" "vect" } }


Re: [PATCH] Fix vectorizer part of PR81303

2017-12-08 Thread Christophe Lyon
On 8 December 2017 at 09:07, Richard Biener  wrote:
> On Thu, 7 Dec 2017, Bin.Cheng wrote:
>
>> On Wed, Dec 6, 2017 at 1:29 PM, Richard Biener  wrote:
>> >
>> > The following fixes a vectorization issue that appears when trying
>> > to vectorize the bwaves mat_times_vec kernel after interchange was
>> > performed by the interchange pass.  That interchange inserts the
>> > following code for the former reduction created by LIM store-motion:
>> I do observe more cases are vectorized by this patch on AArch64.
>> Still want to find a way not generating the cond_expr, but for the moment
>> I will have another patch make interchange even more conservative for
>> small cases.  In which the new cmp/select instructions do cost a lot against
>> the small loop body.
>
> Yeah.  I thought about what it takes to avoid the conditional - basically
> we'd need to turn the init value to a (non-nested) loop that we'd need
> to insert on the preheader of the outer loop.
>

Hi,

I noticed a regression on aarch64 after Bin's commit r255472:
gcc.target/aarch64/pr62178.c scan-assembler ldr\\tq[0-9]+,
\\[x[0-9]+\\], [0-9]+
gcc.target/aarch64/pr62178.c scan-assembler ldr\\ts[0-9]+,
\\[x[0-9]+, [0-9]+\\]!
gcc.target/aarch64/pr62178.c scan-assembler mla\\tv[0-9]+.4s,
v[0-9]+.4s, v[0-9]+.s\\[0\\]

Is this patch supposed to fix it?

Thanks,

Christophe

> Richard.
>
>> Thanks,
>> bin
>> >
>> >[local count: 161061274]:
>> >   # m_58 = PHI <1(10), m_84(20)>
>> > ...
>> >[local count: 912680551]:
>> >   # l_35 = PHI <1(13), l_57(21)>
>> > ...
>> >   y__I_lsm.113_140 = *y_139(D)[_31];
>> >   y__I_lsm.113_94 = m_58 != 1 ? y__I_lsm.113_140 : 0.0;
>> > ...
>> >   *y_139(D)[_31] = _101;
>> >
>> >
>> > so we have a COND_EXPR with a test on an integer IV m_58 with
>> > double values.  Note that the m_58 != 1 condition is invariant
>> > in the l loop.
>> >
>> > Currently we vectorize this condition using V8SImode vectors
>> > causing a vectorization factor of 8 and thus forcing the scalar
>> > path for the bwaves case (the loops have an iteration count of 5).
>> >
>> > The following patch makes the vectorizer handle invariant conditions
>> > in the first place and second handle widening of operands of invariant
>> > conditions transparently (the promotion will happen on the invariant
>> > scalars).  This makes it possible to use a vectorization factor of 4,
>> > reducing the bwaves runtime from 208s before interchange
>> > (via 190s after interchange) to 172s after interchange and vectorization
>> > with AVX256 (on a Haswell machine).
>> >
>> > For the vectorizable_condition part to work I need to avoid
>> > pulling apart the condition from the COND_EXPR during pattern
>> > detection.
>> >
>> > Bootstrapped on x86_64-unknown-linux-gnu, testing in progress.
>> >
>> > Richard.
>> >
>> > 2017-12-06  Richard Biener  
>> >
>> > PR tree-optimization/81303
>> > * tree-vect-stmts.c (vect_is_simple_cond): For invariant
>> > conditions try to create a comparison vector type matching
>> > the data vector type.
>> > (vectorizable_condition): Adjust.
>> > * tree-vect-patterns.c (vect_recog_mask_conversion_pattern):
>> > Leave invariant conditions alone in case we can vectorize those.
>> >
>> > * gcc.target/i386/vectorize9.c: New testcase.
>> > * gcc.target/i386/vectorize10.c: New testcase.
>> >
>> > Index: gcc/tree-vect-stmts.c
>> > ===
>> > --- gcc/tree-vect-stmts.c   (revision 255438)
>> > +++ gcc/tree-vect-stmts.c   (working copy)
>> > @@ -7792,7 +7792,8 @@ vectorizable_load (gimple *stmt, gimple_
>> >
>> >  static bool
>> >  vect_is_simple_cond (tree cond, vec_info *vinfo,
>> > -tree *comp_vectype, enum vect_def_type *dts)
>> > +tree *comp_vectype, enum vect_def_type *dts,
>> > +tree vectype)
>> >  {
>> >tree lhs, rhs;
>> >tree vectype1 = NULL_TREE, vectype2 = NULL_TREE;
>> > @@ -7845,6 +7846,20 @@ vect_is_simple_cond (tree cond, vec_info
>> >  return false;
>> >
>> >*comp_vectype = vectype1 ? vectype1 : vectype2;
>> > +  /* Invariant comparison.  */
>> > +  if (! *comp_vectype)
>> > +{
>> > +  tree scalar_type = TREE_TYPE (lhs);
>> > +  /* If we can widen the comparison to match vectype do so.  */
>> > +  if (INTEGRAL_TYPE_P (scalar_type)
>> > + && tree_int_cst_lt (TYPE_SIZE (scalar_type),
>> > + TYPE_SIZE (TREE_TYPE (vectype
>> > +   scalar_type = build_nonstandard_integer_type
>> > + (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (vectype))),
>> > +  TYPE_UNSIGNED (scalar_type));
>> > +  *comp_vectype = get_vectype_for_scalar_type (scalar_type);
>> > +}
>> > +
>> >return true;
>> >  }
>> >
>> > @@ -7942,7 +7957,7 @@ vectorizable_condition (gimple *stmt, gi
>> >else_clause = gimple_assign_rhs3 (stmt);
>> >
>> >if (!vect_is_simple_cond

Re: [PATCH] Fix vectorizer part of PR81303

2017-12-08 Thread Richard Biener
On Fri, 8 Dec 2017, Christophe Lyon wrote:

> On 8 December 2017 at 09:07, Richard Biener  wrote:
> > On Thu, 7 Dec 2017, Bin.Cheng wrote:
> >
> >> On Wed, Dec 6, 2017 at 1:29 PM, Richard Biener  wrote:
> >> >
> >> > The following fixes a vectorization issue that appears when trying
> >> > to vectorize the bwaves mat_times_vec kernel after interchange was
> >> > performed by the interchange pass.  That interchange inserts the
> >> > following code for the former reduction created by LIM store-motion:
> >> I do observe more cases are vectorized by this patch on AArch64.
> >> Still want to find a way not generating the cond_expr, but for the moment
> >> I will have another patch make interchange even more conservative for
> >> small cases.  In which the new cmp/select instructions do cost a lot 
> >> against
> >> the small loop body.
> >
> > Yeah.  I thought about what it takes to avoid the conditional - basically
> > we'd need to turn the init value to a (non-nested) loop that we'd need
> > to insert on the preheader of the outer loop.
> >
> 
> Hi,
> 
> I noticed a regression on aarch64 after Bin's commit r255472:
> gcc.target/aarch64/pr62178.c scan-assembler ldr\\tq[0-9]+,
> \\[x[0-9]+\\], [0-9]+
> gcc.target/aarch64/pr62178.c scan-assembler ldr\\ts[0-9]+,
> \\[x[0-9]+, [0-9]+\\]!
> gcc.target/aarch64/pr62178.c scan-assembler mla\\tv[0-9]+.4s,
> v[0-9]+.4s, v[0-9]+.s\\[0\\]
> 
> Is this patch supposed to fix it?

No, from what I can see the patch shouldn't affect it.  But it's not
clear what the testcase tests for - it just scans assembler.
Clearly we want to interchange the loop here so the scan assembler
needs to be adjusted and one has to revisit PR62178 to check whether
the result is still ok (or simply add -fno-loop-interchange to it).

Richard.

> Thanks,
> 
> Christophe
> 
> > Richard.
> >
> >> Thanks,
> >> bin
> >> >
> >> >[local count: 161061274]:
> >> >   # m_58 = PHI <1(10), m_84(20)>
> >> > ...
> >> >[local count: 912680551]:
> >> >   # l_35 = PHI <1(13), l_57(21)>
> >> > ...
> >> >   y__I_lsm.113_140 = *y_139(D)[_31];
> >> >   y__I_lsm.113_94 = m_58 != 1 ? y__I_lsm.113_140 : 0.0;
> >> > ...
> >> >   *y_139(D)[_31] = _101;
> >> >
> >> >
> >> > so we have a COND_EXPR with a test on an integer IV m_58 with
> >> > double values.  Note that the m_58 != 1 condition is invariant
> >> > in the l loop.
> >> >
> >> > Currently we vectorize this condition using V8SImode vectors
> >> > causing a vectorization factor of 8 and thus forcing the scalar
> >> > path for the bwaves case (the loops have an iteration count of 5).
> >> >
> >> > The following patch makes the vectorizer handle invariant conditions
> >> > in the first place and second handle widening of operands of invariant
> >> > conditions transparently (the promotion will happen on the invariant
> >> > scalars).  This makes it possible to use a vectorization factor of 4,
> >> > reducing the bwaves runtime from 208s before interchange
> >> > (via 190s after interchange) to 172s after interchange and vectorization
> >> > with AVX256 (on a Haswell machine).
> >> >
> >> > For the vectorizable_condition part to work I need to avoid
> >> > pulling apart the condition from the COND_EXPR during pattern
> >> > detection.
> >> >
> >> > Bootstrapped on x86_64-unknown-linux-gnu, testing in progress.
> >> >
> >> > Richard.
> >> >
> >> > 2017-12-06  Richard Biener  
> >> >
> >> > PR tree-optimization/81303
> >> > * tree-vect-stmts.c (vect_is_simple_cond): For invariant
> >> > conditions try to create a comparison vector type matching
> >> > the data vector type.
> >> > (vectorizable_condition): Adjust.
> >> > * tree-vect-patterns.c (vect_recog_mask_conversion_pattern):
> >> > Leave invariant conditions alone in case we can vectorize those.
> >> >
> >> > * gcc.target/i386/vectorize9.c: New testcase.
> >> > * gcc.target/i386/vectorize10.c: New testcase.
> >> >
> >> > Index: gcc/tree-vect-stmts.c
> >> > ===
> >> > --- gcc/tree-vect-stmts.c   (revision 255438)
> >> > +++ gcc/tree-vect-stmts.c   (working copy)
> >> > @@ -7792,7 +7792,8 @@ vectorizable_load (gimple *stmt, gimple_
> >> >
> >> >  static bool
> >> >  vect_is_simple_cond (tree cond, vec_info *vinfo,
> >> > -tree *comp_vectype, enum vect_def_type *dts)
> >> > +tree *comp_vectype, enum vect_def_type *dts,
> >> > +tree vectype)
> >> >  {
> >> >tree lhs, rhs;
> >> >tree vectype1 = NULL_TREE, vectype2 = NULL_TREE;
> >> > @@ -7845,6 +7846,20 @@ vect_is_simple_cond (tree cond, vec_info
> >> >  return false;
> >> >
> >> >*comp_vectype = vectype1 ? vectype1 : vectype2;
> >> > +  /* Invariant comparison.  */
> >> > +  if (! *comp_vectype)
> >> > +{
> >> > +  tree scalar_type = TREE_TYPE (lhs);
> >> > +  /* If we can widen the comparison to match vectype do so.  */
> 

Re: [PATCH] avoid bogus -Wstringop-overflow for strncpy with _FORTIFY_SOURCE (PR 82646)

2017-12-08 Thread Christophe Lyon
On 7 December 2017 at 20:28, Martin Sebor  wrote:
> On 12/07/2017 06:46 AM, Christophe Lyon wrote:
>>
>> Hi Martin,
>>
>>
>> On 6 December 2017 at 00:51, Jeff Law  wrote:
>>>
>>> On 12/05/2017 04:47 PM, Martin Sebor wrote:

 PR middle-end/82646 - bogus -Wstringop-overflow with
 -D_FORTIFY_SOURCE=2 on strncpy with range to a member array,

 The bug points out a false positive in a call to strncpy() when
 _FORTIFY_SOURCE is defined that doesn't exist otherwise.

 The problem is that __builtin_strncpy buffer overflow checking
 is done along with the expansion of the intrinsic in one place
 and __builtin___strncpy_chk is handled differently in another,
 and the two are out of sync.

 The attached patch corrects the choice of arguments used for
 overflow detection in __builtin___strncpy_chk and aligns
 the diagnostics between the two intrinsics.

 Martin

 gcc-82646.diff


 PR tree-optimization/82646 - bogus -Wstringop-overflow with
 -D_FORTIFY_SOURCE=2 on strncpy with range to a member array

 gcc/ChangeLog:

   PR tree-optimization/82646
   * builtins.c (maybe_emit_chk_warning): Use size as the bound for
   strncpy, not maxlen.

 gcc/testsuite/ChangeLog:

   PR tree-optimization/82646
   * gcc.dg/builtin-stringop-chk-1.c: Adjust.
   * gcc.dg/builtin-stringop-chk-9.c: New test.
>>>
>>> OK.
>>>
>>
>> The new test fails on 32 bits platforms (arm, x86_32, aarch64 ilp32):
>> FAIL:gcc.dg/builtin-stringop-chk-9.c  (test for warnings, line 125)
>> FAIL:gcc.dg/builtin-stringop-chk-9.c  (test for warnings, line 133)
>> FAIL:gcc.dg/builtin-stringop-chk-9.c  (test for warnings, line 141)
>> FAIL:gcc.dg/builtin-stringop-chk-9.c  (test for warnings, line 149)
>
>
> I believe these failures were due to bug 83296 that Richard fixed
> earlier today.  With the change in my tree, the test passes for
> me with the arm-linux-gnueabi cross-compiler.  Can you please
> try again and let me know if the failures persist on any of your
> targets?
>

Indeed I can see it was fixed between r255460 and r255467,
and Richard's patch is in the range (r255466).

Thanks,

Christophe

> Thanks
> Martin


Re: [patch] prevent .cfi_personality/.cfi_lsda on !dwarf eh configurations

2017-12-08 Thread Olivier Hainque

> On Dec 7, 2017, at 23:57 , Jeff Law  wrote:
> 
>>* dwarf2out.c (dwarf2out_do_cfi_startproc): Only emit
>>.cfi_personality or .cfi_lsda if the eh data format is dwarf2.
>> 
> OK.

Now in, thanks Jeff!



Re: [PATCH 5/3] C++ bits to improve detection of attribute conflicts (PR 81544)

2017-12-08 Thread Rainer Orth
Hi Martin,

> Ping: https://gcc.gnu.org/ml/gcc-patches/2017-08/msg01034.html
>
> The attached C++ only patch is rebased on the top of trunk.
>
> The remaining patches in the series (C FE and the back ends)
> have been approved.

your patch broke Solaris bootstrap:

/vol/gcc/src/hg/trunk/local/gcc/config/sparc/sparc.c:699:1: error: missing 
initializer for member 'attribute_spec::exclude' 
[-Werror=missing-field-initializers]
 };
 ^
/vol/gcc/src/hg/trunk/local/gcc/config/sparc/sparc.c:699:1: error: missing 
initializer for member 'attribute_spec::exclude' 
[-Werror=missing-field-initializers]

/vol/gcc/src/hg/trunk/local/gcc/config/i386/i386.c:44761:1: error: missing 
initializer for member 'attribute_spec::exclude' 
[-Werror=missing-field-initializers]
 };
 ^
/vol/gcc/src/hg/trunk/local/gcc/config/i386/i386.c:44761:1: error: missing 
initializer for member 'attribute_spec::exclude' 
[-Werror=missing-field-initializers]


The line numbers are completely misleading, unfortunately.  Hadn't
SUBTARGET_ATTRIBUTE_TABLE been used at the end of the (very short)
sparc_attribute_table, I wouldn't have seen what was wrong.

The following patch fixes the problem, installed on mainline after
i386-pc-solaris2.11 and sparc-sun-solaris2.11 bootstraps completed
without regressions.

However, there are two more SUBTARGET_ATTRIBUTE_TABLE defines:

gcc/config/darwin.h:#define SUBTARGET_ATTRIBUTE_TABLE   
\
gcc/config/i386/cygming.h:#define SUBTARGET_ATTRIBUTE_TABLE \

Rainer

-- 
-
Rainer Orth, Center for Biotechnology, Bielefeld University


2017-12-08  Rainer Orth  

* config/sol2.h (SOLARIS_ATTRIBUTE_TABLE): Initialize new member
of struct attribute_spec.

# HG changeset patch
# Parent  437a73acf1a5ffdc19298cad526f5f85d5496ed0
Initialize new member of SOLARIS_ATTRIBUTE_TABLE

diff --git a/gcc/config/sol2.h b/gcc/config/sol2.h
--- a/gcc/config/sol2.h
+++ b/gcc/config/sol2.h
@@ -409,8 +409,8 @@ along with GCC; see the file COPYING3.  
 /* #pragma init and #pragma fini are implemented on top of init and
fini attributes.  */
 #define SOLARIS_ATTRIBUTE_TABLE		\
-  { "init",  0, 0, true,  false,  false, NULL, false },		\
-  { "fini",  0, 0, true,  false,  false, NULL, false }
+  { "init",  0, 0, true,  false,  false, NULL, false, NULL },	\
+  { "fini",  0, 0, true,  false,  false, NULL, false, NULL }
 
 /* Solaris-specific #pragmas are implemented on top of attributes.  Hook in
the bits from config/sol2.c.  */


Re: [PATCH] Fix vectorizer part of PR81303

2017-12-08 Thread Bin.Cheng
On Fri, Dec 8, 2017 at 8:29 AM, Richard Biener  wrote:
> On Fri, 8 Dec 2017, Christophe Lyon wrote:
>
>> On 8 December 2017 at 09:07, Richard Biener  wrote:
>> > On Thu, 7 Dec 2017, Bin.Cheng wrote:
>> >
>> >> On Wed, Dec 6, 2017 at 1:29 PM, Richard Biener  wrote:
>> >> >
>> >> > The following fixes a vectorization issue that appears when trying
>> >> > to vectorize the bwaves mat_times_vec kernel after interchange was
>> >> > performed by the interchange pass.  That interchange inserts the
>> >> > following code for the former reduction created by LIM store-motion:
>> >> I do observe more cases are vectorized by this patch on AArch64.
>> >> Still want to find a way not generating the cond_expr, but for the moment
>> >> I will have another patch make interchange even more conservative for
>> >> small cases.  In which the new cmp/select instructions do cost a lot 
>> >> against
>> >> the small loop body.
>> >
>> > Yeah.  I thought about what it takes to avoid the conditional - basically
>> > we'd need to turn the init value to a (non-nested) loop that we'd need
>> > to insert on the preheader of the outer loop.
>> >
>>
>> Hi,
>>
>> I noticed a regression on aarch64 after Bin's commit r255472:
>> gcc.target/aarch64/pr62178.c scan-assembler ldr\\tq[0-9]+,
>> \\[x[0-9]+\\], [0-9]+
>> gcc.target/aarch64/pr62178.c scan-assembler ldr\\ts[0-9]+,
>> \\[x[0-9]+, [0-9]+\\]!
>> gcc.target/aarch64/pr62178.c scan-assembler mla\\tv[0-9]+.4s,
>> v[0-9]+.4s, v[0-9]+.s\\[0\\]
>>
>> Is this patch supposed to fix it?
>
> No, from what I can see the patch shouldn't affect it.  But it's not
> clear what the testcase tests for - it just scans assembler.
> Clearly we want to interchange the loop here so the scan assembler
I am not very sure.  Though interchanging gives better cache behavior,
but the loop is relatively small here,  the introduced cond_expr
results in two more instructions, as well as one additional memory
access from undoing reduction.  Together with addressing mode chosen
in ivopts, it leads to obvious regression.
Ah, another issue is the cond_expr blocks vectorization without your
patch here.  This case is what I meant small loops in which more
conservative interchange may be wanted.

Thanks,
bin

> needs to be adjusted and one has to revisit PR62178 to check whether
> the result is still ok (or simply add -fno-loop-interchange to it).
>
> Richard.
>
>> Thanks,
>>
>> Christophe
>>
>> > Richard.
>> >
>> >> Thanks,
>> >> bin
>> >> >
>> >> >[local count: 161061274]:
>> >> >   # m_58 = PHI <1(10), m_84(20)>
>> >> > ...
>> >> >[local count: 912680551]:
>> >> >   # l_35 = PHI <1(13), l_57(21)>
>> >> > ...
>> >> >   y__I_lsm.113_140 = *y_139(D)[_31];
>> >> >   y__I_lsm.113_94 = m_58 != 1 ? y__I_lsm.113_140 : 0.0;
>> >> > ...
>> >> >   *y_139(D)[_31] = _101;
>> >> >
>> >> >
>> >> > so we have a COND_EXPR with a test on an integer IV m_58 with
>> >> > double values.  Note that the m_58 != 1 condition is invariant
>> >> > in the l loop.
>> >> >
>> >> > Currently we vectorize this condition using V8SImode vectors
>> >> > causing a vectorization factor of 8 and thus forcing the scalar
>> >> > path for the bwaves case (the loops have an iteration count of 5).
>> >> >
>> >> > The following patch makes the vectorizer handle invariant conditions
>> >> > in the first place and second handle widening of operands of invariant
>> >> > conditions transparently (the promotion will happen on the invariant
>> >> > scalars).  This makes it possible to use a vectorization factor of 4,
>> >> > reducing the bwaves runtime from 208s before interchange
>> >> > (via 190s after interchange) to 172s after interchange and vectorization
>> >> > with AVX256 (on a Haswell machine).
>> >> >
>> >> > For the vectorizable_condition part to work I need to avoid
>> >> > pulling apart the condition from the COND_EXPR during pattern
>> >> > detection.
>> >> >
>> >> > Bootstrapped on x86_64-unknown-linux-gnu, testing in progress.
>> >> >
>> >> > Richard.
>> >> >
>> >> > 2017-12-06  Richard Biener  
>> >> >
>> >> > PR tree-optimization/81303
>> >> > * tree-vect-stmts.c (vect_is_simple_cond): For invariant
>> >> > conditions try to create a comparison vector type matching
>> >> > the data vector type.
>> >> > (vectorizable_condition): Adjust.
>> >> > * tree-vect-patterns.c (vect_recog_mask_conversion_pattern):
>> >> > Leave invariant conditions alone in case we can vectorize those.
>> >> >
>> >> > * gcc.target/i386/vectorize9.c: New testcase.
>> >> > * gcc.target/i386/vectorize10.c: New testcase.
>> >> >
>> >> > Index: gcc/tree-vect-stmts.c
>> >> > ===
>> >> > --- gcc/tree-vect-stmts.c   (revision 255438)
>> >> > +++ gcc/tree-vect-stmts.c   (working copy)
>> >> > @@ -7792,7 +7792,8 @@ vectorizable_load (gimple *stmt, gimple_
>> >> >
>> >> >  static bool
>> >> >  vect_is_simple_cond (tree cond, vec_

Re: [PATCH] Fix vectorizer part of PR81303

2017-12-08 Thread Richard Biener
On Fri, 8 Dec 2017, Bin.Cheng wrote:

> On Fri, Dec 8, 2017 at 8:29 AM, Richard Biener  wrote:
> > On Fri, 8 Dec 2017, Christophe Lyon wrote:
> >
> >> On 8 December 2017 at 09:07, Richard Biener  wrote:
> >> > On Thu, 7 Dec 2017, Bin.Cheng wrote:
> >> >
> >> >> On Wed, Dec 6, 2017 at 1:29 PM, Richard Biener  
> >> >> wrote:
> >> >> >
> >> >> > The following fixes a vectorization issue that appears when trying
> >> >> > to vectorize the bwaves mat_times_vec kernel after interchange was
> >> >> > performed by the interchange pass.  That interchange inserts the
> >> >> > following code for the former reduction created by LIM store-motion:
> >> >> I do observe more cases are vectorized by this patch on AArch64.
> >> >> Still want to find a way not generating the cond_expr, but for the 
> >> >> moment
> >> >> I will have another patch make interchange even more conservative for
> >> >> small cases.  In which the new cmp/select instructions do cost a lot 
> >> >> against
> >> >> the small loop body.
> >> >
> >> > Yeah.  I thought about what it takes to avoid the conditional - basically
> >> > we'd need to turn the init value to a (non-nested) loop that we'd need
> >> > to insert on the preheader of the outer loop.
> >> >
> >>
> >> Hi,
> >>
> >> I noticed a regression on aarch64 after Bin's commit r255472:
> >> gcc.target/aarch64/pr62178.c scan-assembler ldr\\tq[0-9]+,
> >> \\[x[0-9]+\\], [0-9]+
> >> gcc.target/aarch64/pr62178.c scan-assembler ldr\\ts[0-9]+,
> >> \\[x[0-9]+, [0-9]+\\]!
> >> gcc.target/aarch64/pr62178.c scan-assembler mla\\tv[0-9]+.4s,
> >> v[0-9]+.4s, v[0-9]+.s\\[0\\]
> >>
> >> Is this patch supposed to fix it?
> >
> > No, from what I can see the patch shouldn't affect it.  But it's not
> > clear what the testcase tests for - it just scans assembler.
> > Clearly we want to interchange the loop here so the scan assembler
> I am not very sure.  Though interchanging gives better cache behavior,
> but the loop is relatively small here,  the introduced cond_expr
> results in two more instructions, as well as one additional memory
> access from undoing reduction.  Together with addressing mode chosen
> in ivopts, it leads to obvious regression.
> Ah, another issue is the cond_expr blocks vectorization without your
> patch here.  This case is what I meant small loops in which more
> conservative interchange may be wanted.

The loop has int data and int IVs so my patch shouldn't be necessary
to vectorize the loop.

Richard.

> Thanks,
> bin
> 
> > needs to be adjusted and one has to revisit PR62178 to check whether
> > the result is still ok (or simply add -fno-loop-interchange to it).
> >
> > Richard.
> >
> >> Thanks,
> >>
> >> Christophe
> >>
> >> > Richard.
> >> >
> >> >> Thanks,
> >> >> bin
> >> >> >
> >> >> >[local count: 161061274]:
> >> >> >   # m_58 = PHI <1(10), m_84(20)>
> >> >> > ...
> >> >> >[local count: 912680551]:
> >> >> >   # l_35 = PHI <1(13), l_57(21)>
> >> >> > ...
> >> >> >   y__I_lsm.113_140 = *y_139(D)[_31];
> >> >> >   y__I_lsm.113_94 = m_58 != 1 ? y__I_lsm.113_140 : 0.0;
> >> >> > ...
> >> >> >   *y_139(D)[_31] = _101;
> >> >> >
> >> >> >
> >> >> > so we have a COND_EXPR with a test on an integer IV m_58 with
> >> >> > double values.  Note that the m_58 != 1 condition is invariant
> >> >> > in the l loop.
> >> >> >
> >> >> > Currently we vectorize this condition using V8SImode vectors
> >> >> > causing a vectorization factor of 8 and thus forcing the scalar
> >> >> > path for the bwaves case (the loops have an iteration count of 5).
> >> >> >
> >> >> > The following patch makes the vectorizer handle invariant conditions
> >> >> > in the first place and second handle widening of operands of invariant
> >> >> > conditions transparently (the promotion will happen on the invariant
> >> >> > scalars).  This makes it possible to use a vectorization factor of 4,
> >> >> > reducing the bwaves runtime from 208s before interchange
> >> >> > (via 190s after interchange) to 172s after interchange and 
> >> >> > vectorization
> >> >> > with AVX256 (on a Haswell machine).
> >> >> >
> >> >> > For the vectorizable_condition part to work I need to avoid
> >> >> > pulling apart the condition from the COND_EXPR during pattern
> >> >> > detection.
> >> >> >
> >> >> > Bootstrapped on x86_64-unknown-linux-gnu, testing in progress.
> >> >> >
> >> >> > Richard.
> >> >> >
> >> >> > 2017-12-06  Richard Biener  
> >> >> >
> >> >> > PR tree-optimization/81303
> >> >> > * tree-vect-stmts.c (vect_is_simple_cond): For invariant
> >> >> > conditions try to create a comparison vector type matching
> >> >> > the data vector type.
> >> >> > (vectorizable_condition): Adjust.
> >> >> > * tree-vect-patterns.c (vect_recog_mask_conversion_pattern):
> >> >> > Leave invariant conditions alone in case we can vectorize 
> >> >> > those.
> >> >> >
> >> >> > * gcc.target/i386/vectorize9.c: New testcase.
> >> >> > * gcc.target/i386/vec

Re: [PATCH] Fix vectorizer part of PR81303

2017-12-08 Thread Bin.Cheng
On Fri, Dec 8, 2017 at 9:54 AM, Richard Biener  wrote:
> On Fri, 8 Dec 2017, Bin.Cheng wrote:
>
>> On Fri, Dec 8, 2017 at 8:29 AM, Richard Biener  wrote:
>> > On Fri, 8 Dec 2017, Christophe Lyon wrote:
>> >
>> >> On 8 December 2017 at 09:07, Richard Biener  wrote:
>> >> > On Thu, 7 Dec 2017, Bin.Cheng wrote:
>> >> >
>> >> >> On Wed, Dec 6, 2017 at 1:29 PM, Richard Biener  
>> >> >> wrote:
>> >> >> >
>> >> >> > The following fixes a vectorization issue that appears when trying
>> >> >> > to vectorize the bwaves mat_times_vec kernel after interchange was
>> >> >> > performed by the interchange pass.  That interchange inserts the
>> >> >> > following code for the former reduction created by LIM store-motion:
>> >> >> I do observe more cases are vectorized by this patch on AArch64.
>> >> >> Still want to find a way not generating the cond_expr, but for the 
>> >> >> moment
>> >> >> I will have another patch make interchange even more conservative for
>> >> >> small cases.  In which the new cmp/select instructions do cost a lot 
>> >> >> against
>> >> >> the small loop body.
>> >> >
>> >> > Yeah.  I thought about what it takes to avoid the conditional - 
>> >> > basically
>> >> > we'd need to turn the init value to a (non-nested) loop that we'd need
>> >> > to insert on the preheader of the outer loop.
>> >> >
>> >>
>> >> Hi,
>> >>
>> >> I noticed a regression on aarch64 after Bin's commit r255472:
>> >> gcc.target/aarch64/pr62178.c scan-assembler ldr\\tq[0-9]+,
>> >> \\[x[0-9]+\\], [0-9]+
>> >> gcc.target/aarch64/pr62178.c scan-assembler ldr\\ts[0-9]+,
>> >> \\[x[0-9]+, [0-9]+\\]!
>> >> gcc.target/aarch64/pr62178.c scan-assembler mla\\tv[0-9]+.4s,
>> >> v[0-9]+.4s, v[0-9]+.s\\[0\\]
>> >>
>> >> Is this patch supposed to fix it?
>> >
>> > No, from what I can see the patch shouldn't affect it.  But it's not
>> > clear what the testcase tests for - it just scans assembler.
>> > Clearly we want to interchange the loop here so the scan assembler
>> I am not very sure.  Though interchanging gives better cache behavior,
>> but the loop is relatively small here,  the introduced cond_expr
>> results in two more instructions, as well as one additional memory
>> access from undoing reduction.  Together with addressing mode chosen
>> in ivopts, it leads to obvious regression.
>> Ah, another issue is the cond_expr blocks vectorization without your
>> patch here.  This case is what I meant small loops in which more
>> conservative interchange may be wanted.
>
> The loop has int data and int IVs so my patch shouldn't be necessary
> to vectorize the loop.
I haven't got time look into vectorizer part yet, but there is below
in dump file:

pr62178.c:12:7: note: vect_is_simple_use: operand k_9
pr62178.c:12:7: note: def_stmt: k_9 = PHI <1(7), k_38(10)>
pr62178.c:12:7: note: type of def: external
pr62178.c:12:7: note: not vectorized: relevant stmt not supported:
r_I_I_lsm.0_19 = k_9 != 1 ? r_I_I_lsm.0_14 : 0;
pr62178.c:12:7: note: bad operation or unsupported loop bound.
pr62178.c:12:7: note: * Re-trying analysis with vector size 8

Thanks,
bin
>
> Richard.
>
>> Thanks,
>> bin
>>
>> > needs to be adjusted and one has to revisit PR62178 to check whether
>> > the result is still ok (or simply add -fno-loop-interchange to it).
>> >
>> > Richard.
>> >
>> >> Thanks,
>> >>
>> >> Christophe
>> >>
>> >> > Richard.
>> >> >
>> >> >> Thanks,
>> >> >> bin
>> >> >> >
>> >> >> >[local count: 161061274]:
>> >> >> >   # m_58 = PHI <1(10), m_84(20)>
>> >> >> > ...
>> >> >> >[local count: 912680551]:
>> >> >> >   # l_35 = PHI <1(13), l_57(21)>
>> >> >> > ...
>> >> >> >   y__I_lsm.113_140 = *y_139(D)[_31];
>> >> >> >   y__I_lsm.113_94 = m_58 != 1 ? y__I_lsm.113_140 : 0.0;
>> >> >> > ...
>> >> >> >   *y_139(D)[_31] = _101;
>> >> >> >
>> >> >> >
>> >> >> > so we have a COND_EXPR with a test on an integer IV m_58 with
>> >> >> > double values.  Note that the m_58 != 1 condition is invariant
>> >> >> > in the l loop.
>> >> >> >
>> >> >> > Currently we vectorize this condition using V8SImode vectors
>> >> >> > causing a vectorization factor of 8 and thus forcing the scalar
>> >> >> > path for the bwaves case (the loops have an iteration count of 5).
>> >> >> >
>> >> >> > The following patch makes the vectorizer handle invariant conditions
>> >> >> > in the first place and second handle widening of operands of 
>> >> >> > invariant
>> >> >> > conditions transparently (the promotion will happen on the invariant
>> >> >> > scalars).  This makes it possible to use a vectorization factor of 4,
>> >> >> > reducing the bwaves runtime from 208s before interchange
>> >> >> > (via 190s after interchange) to 172s after interchange and 
>> >> >> > vectorization
>> >> >> > with AVX256 (on a Haswell machine).
>> >> >> >
>> >> >> > For the vectorizable_condition part to work I need to avoid
>> >> >> > pulling apart the condition from the COND_EXPR during pattern
>> >> >> > detection.
>> >> >> >
>> >> >> > Bootstrapped on x86_64-unknown-linux-gnu,

Re: [PATCH 5/3] C++ bits to improve detection of attribute conflicts (PR 81544)

2017-12-08 Thread Jakub Jelinek
On Fri, Dec 08, 2017 at 10:43:58AM +0100, Rainer Orth wrote:
> The line numbers are completely misleading, unfortunately.  Hadn't
> SUBTARGET_ATTRIBUTE_TABLE been used at the end of the (very short)
> sparc_attribute_table, I wouldn't have seen what was wrong.
> 
> The following patch fixes the problem, installed on mainline after
> i386-pc-solaris2.11 and sparc-sun-solaris2.11 bootstraps completed
> without regressions.
> 
> However, there are two more SUBTARGET_ATTRIBUTE_TABLE defines:
> 
> gcc/config/darwin.h:#define SUBTARGET_ATTRIBUTE_TABLE 
>   \
> gcc/config/i386/cygming.h:#define SUBTARGET_ATTRIBUTE_TABLE \

I'll deal with this.

> 2017-12-08  Rainer Orth  
> 
>   * config/sol2.h (SOLARIS_ATTRIBUTE_TABLE): Initialize new member
>   of struct attribute_spec.

Ok for trunk.

Jakub


Re: [PATCH 5/3] C++ bits to improve detection of attribute conflicts (PR 81544)

2017-12-08 Thread Rainer Orth
Jakub Jelinek  writes:

> On Fri, Dec 08, 2017 at 10:43:58AM +0100, Rainer Orth wrote:
>> The line numbers are completely misleading, unfortunately.  Hadn't
>> SUBTARGET_ATTRIBUTE_TABLE been used at the end of the (very short)
>> sparc_attribute_table, I wouldn't have seen what was wrong.
>> 
>> The following patch fixes the problem, installed on mainline after
>> i386-pc-solaris2.11 and sparc-sun-solaris2.11 bootstraps completed
>> without regressions.
>> 
>> However, there are two more SUBTARGET_ATTRIBUTE_TABLE defines:
>> 
>> gcc/config/darwin.h:#define SUBTARGET_ATTRIBUTE_TABLE \
>> gcc/config/i386/cygming.h:#define SUBTARGET_ATTRIBUTE_TABLE \
>
> I'll deal with this.

This is what I'm currently testing with an  x86_64-apple-darwin11.4.2
bootstrap.  Ok if it passes?

Thanks.

Rainer

-- 
-
Rainer Orth, Center for Biotechnology, Bielefeld University


2017-12-08  Rainer Orth  

* config/darwin.h (SUBTARGET_ATTRIBUTE_TABLE): Initialize new
member of struct attribute_spec.

diff --git a/gcc/config/darwin.h b/gcc/config/darwin.h
--- a/gcc/config/darwin.h
+++ b/gcc/config/darwin.h
@@ -742,11 +742,11 @@ extern GTY(()) section * darwin_sections
 /* Extra attributes for Darwin.  */
 #define SUBTARGET_ATTRIBUTE_TABLE	 \
   /* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler, \
-   affects_type_identity } */		 \
+   affects_type_identity, exclude } */ \
   { "apple_kext_compatibility", 0, 0, false, true, false,		 \
-darwin_handle_kext_attribute, false }, \
+darwin_handle_kext_attribute, false, NULL },			 \
   { "weak_import", 0, 0, true, false, false, \
-darwin_handle_weak_import_attribute, false }
+darwin_handle_weak_import_attribute, false, NULL }
 
 /* Make local constant labels linker-visible, so that if one follows a
weak_global constant, ld64 will be able to separate the atoms.  */


Re: [PATCH] Fix vectorizer part of PR81303

2017-12-08 Thread Richard Biener
On Fri, 8 Dec 2017, Bin.Cheng wrote:

> On Fri, Dec 8, 2017 at 9:54 AM, Richard Biener  wrote:
> > On Fri, 8 Dec 2017, Bin.Cheng wrote:
> >
> >> On Fri, Dec 8, 2017 at 8:29 AM, Richard Biener  wrote:
> >> > On Fri, 8 Dec 2017, Christophe Lyon wrote:
> >> >
> >> >> On 8 December 2017 at 09:07, Richard Biener  wrote:
> >> >> > On Thu, 7 Dec 2017, Bin.Cheng wrote:
> >> >> >
> >> >> >> On Wed, Dec 6, 2017 at 1:29 PM, Richard Biener  
> >> >> >> wrote:
> >> >> >> >
> >> >> >> > The following fixes a vectorization issue that appears when trying
> >> >> >> > to vectorize the bwaves mat_times_vec kernel after interchange was
> >> >> >> > performed by the interchange pass.  That interchange inserts the
> >> >> >> > following code for the former reduction created by LIM 
> >> >> >> > store-motion:
> >> >> >> I do observe more cases are vectorized by this patch on AArch64.
> >> >> >> Still want to find a way not generating the cond_expr, but for the 
> >> >> >> moment
> >> >> >> I will have another patch make interchange even more conservative for
> >> >> >> small cases.  In which the new cmp/select instructions do cost a lot 
> >> >> >> against
> >> >> >> the small loop body.
> >> >> >
> >> >> > Yeah.  I thought about what it takes to avoid the conditional - 
> >> >> > basically
> >> >> > we'd need to turn the init value to a (non-nested) loop that we'd need
> >> >> > to insert on the preheader of the outer loop.
> >> >> >
> >> >>
> >> >> Hi,
> >> >>
> >> >> I noticed a regression on aarch64 after Bin's commit r255472:
> >> >> gcc.target/aarch64/pr62178.c scan-assembler ldr\\tq[0-9]+,
> >> >> \\[x[0-9]+\\], [0-9]+
> >> >> gcc.target/aarch64/pr62178.c scan-assembler ldr\\ts[0-9]+,
> >> >> \\[x[0-9]+, [0-9]+\\]!
> >> >> gcc.target/aarch64/pr62178.c scan-assembler mla\\tv[0-9]+.4s,
> >> >> v[0-9]+.4s, v[0-9]+.s\\[0\\]
> >> >>
> >> >> Is this patch supposed to fix it?
> >> >
> >> > No, from what I can see the patch shouldn't affect it.  But it's not
> >> > clear what the testcase tests for - it just scans assembler.
> >> > Clearly we want to interchange the loop here so the scan assembler
> >> I am not very sure.  Though interchanging gives better cache behavior,
> >> but the loop is relatively small here,  the introduced cond_expr
> >> results in two more instructions, as well as one additional memory
> >> access from undoing reduction.  Together with addressing mode chosen
> >> in ivopts, it leads to obvious regression.
> >> Ah, another issue is the cond_expr blocks vectorization without your
> >> patch here.  This case is what I meant small loops in which more
> >> conservative interchange may be wanted.
> >
> > The loop has int data and int IVs so my patch shouldn't be necessary
> > to vectorize the loop.
> I haven't got time look into vectorizer part yet, but there is below
> in dump file:
> 
> pr62178.c:12:7: note: vect_is_simple_use: operand k_9
> pr62178.c:12:7: note: def_stmt: k_9 = PHI <1(7), k_38(10)>
> pr62178.c:12:7: note: type of def: external
> pr62178.c:12:7: note: not vectorized: relevant stmt not supported:
> r_I_I_lsm.0_19 = k_9 != 1 ? r_I_I_lsm.0_14 : 0;
> pr62178.c:12:7: note: bad operation or unsupported loop bound.
> pr62178.c:12:7: note: * Re-trying analysis with vector size 8

Yes, so neon doesn't have a way to vectorize such conditionals?
It does set vect_condition and even vect_cond_mixed though.
You'd have to dive into why it thinks it cannot vectorize it.

I suggest opening a bug if my patch didn't help.

Richard.

> Thanks,
> bin
> >
> > Richard.
> >
> >> Thanks,
> >> bin
> >>
> >> > needs to be adjusted and one has to revisit PR62178 to check whether
> >> > the result is still ok (or simply add -fno-loop-interchange to it).
> >> >
> >> > Richard.
> >> >
> >> >> Thanks,
> >> >>
> >> >> Christophe
> >> >>
> >> >> > Richard.
> >> >> >
> >> >> >> Thanks,
> >> >> >> bin
> >> >> >> >
> >> >> >> >[local count: 161061274]:
> >> >> >> >   # m_58 = PHI <1(10), m_84(20)>
> >> >> >> > ...
> >> >> >> >[local count: 912680551]:
> >> >> >> >   # l_35 = PHI <1(13), l_57(21)>
> >> >> >> > ...
> >> >> >> >   y__I_lsm.113_140 = *y_139(D)[_31];
> >> >> >> >   y__I_lsm.113_94 = m_58 != 1 ? y__I_lsm.113_140 : 0.0;
> >> >> >> > ...
> >> >> >> >   *y_139(D)[_31] = _101;
> >> >> >> >
> >> >> >> >
> >> >> >> > so we have a COND_EXPR with a test on an integer IV m_58 with
> >> >> >> > double values.  Note that the m_58 != 1 condition is invariant
> >> >> >> > in the l loop.
> >> >> >> >
> >> >> >> > Currently we vectorize this condition using V8SImode vectors
> >> >> >> > causing a vectorization factor of 8 and thus forcing the scalar
> >> >> >> > path for the bwaves case (the loops have an iteration count of 5).
> >> >> >> >
> >> >> >> > The following patch makes the vectorizer handle invariant 
> >> >> >> > conditions
> >> >> >> > in the first place and second handle widening of operands of 
> >> >> >> > invariant
> >> >> >> > conditions transparently (the promotion will happen on the 
> >>

Re: [PATCH] Fix vectorizer part of PR81303

2017-12-08 Thread Bin.Cheng
On Fri, Dec 8, 2017 at 10:39 AM, Richard Biener  wrote:
> On Fri, 8 Dec 2017, Bin.Cheng wrote:
>
>> On Fri, Dec 8, 2017 at 9:54 AM, Richard Biener  wrote:
>> > On Fri, 8 Dec 2017, Bin.Cheng wrote:
>> >
>> >> On Fri, Dec 8, 2017 at 8:29 AM, Richard Biener  wrote:
>> >> > On Fri, 8 Dec 2017, Christophe Lyon wrote:
>> >> >
>> >> >> On 8 December 2017 at 09:07, Richard Biener  wrote:
>> >> >> > On Thu, 7 Dec 2017, Bin.Cheng wrote:
>> >> >> >
>> >> >> >> On Wed, Dec 6, 2017 at 1:29 PM, Richard Biener  
>> >> >> >> wrote:
>> >> >> >> >
>> >> >> >> > The following fixes a vectorization issue that appears when trying
>> >> >> >> > to vectorize the bwaves mat_times_vec kernel after interchange was
>> >> >> >> > performed by the interchange pass.  That interchange inserts the
>> >> >> >> > following code for the former reduction created by LIM 
>> >> >> >> > store-motion:
>> >> >> >> I do observe more cases are vectorized by this patch on AArch64.
>> >> >> >> Still want to find a way not generating the cond_expr, but for the 
>> >> >> >> moment
>> >> >> >> I will have another patch make interchange even more conservative 
>> >> >> >> for
>> >> >> >> small cases.  In which the new cmp/select instructions do cost a 
>> >> >> >> lot against
>> >> >> >> the small loop body.
>> >> >> >
>> >> >> > Yeah.  I thought about what it takes to avoid the conditional - 
>> >> >> > basically
>> >> >> > we'd need to turn the init value to a (non-nested) loop that we'd 
>> >> >> > need
>> >> >> > to insert on the preheader of the outer loop.
>> >> >> >
>> >> >>
>> >> >> Hi,
>> >> >>
>> >> >> I noticed a regression on aarch64 after Bin's commit r255472:
>> >> >> gcc.target/aarch64/pr62178.c scan-assembler ldr\\tq[0-9]+,
>> >> >> \\[x[0-9]+\\], [0-9]+
>> >> >> gcc.target/aarch64/pr62178.c scan-assembler ldr\\ts[0-9]+,
>> >> >> \\[x[0-9]+, [0-9]+\\]!
>> >> >> gcc.target/aarch64/pr62178.c scan-assembler mla\\tv[0-9]+.4s,
>> >> >> v[0-9]+.4s, v[0-9]+.s\\[0\\]
>> >> >>
>> >> >> Is this patch supposed to fix it?
>> >> >
>> >> > No, from what I can see the patch shouldn't affect it.  But it's not
>> >> > clear what the testcase tests for - it just scans assembler.
>> >> > Clearly we want to interchange the loop here so the scan assembler
>> >> I am not very sure.  Though interchanging gives better cache behavior,
>> >> but the loop is relatively small here,  the introduced cond_expr
>> >> results in two more instructions, as well as one additional memory
>> >> access from undoing reduction.  Together with addressing mode chosen
>> >> in ivopts, it leads to obvious regression.
>> >> Ah, another issue is the cond_expr blocks vectorization without your
>> >> patch here.  This case is what I meant small loops in which more
>> >> conservative interchange may be wanted.
>> >
>> > The loop has int data and int IVs so my patch shouldn't be necessary
>> > to vectorize the loop.
>> I haven't got time look into vectorizer part yet, but there is below
>> in dump file:
>>
>> pr62178.c:12:7: note: vect_is_simple_use: operand k_9
>> pr62178.c:12:7: note: def_stmt: k_9 = PHI <1(7), k_38(10)>
>> pr62178.c:12:7: note: type of def: external
>> pr62178.c:12:7: note: not vectorized: relevant stmt not supported:
>> r_I_I_lsm.0_19 = k_9 != 1 ? r_I_I_lsm.0_14 : 0;
>> pr62178.c:12:7: note: bad operation or unsupported loop bound.
>> pr62178.c:12:7: note: * Re-trying analysis with vector size 8
>
> Yes, so neon doesn't have a way to vectorize such conditionals?
> It does set vect_condition and even vect_cond_mixed though.
It should.  IIRC, it was me enables vect_cond* stuff on AArch64?  Will
look into it after fixing more urgent bugs.
> You'd have to dive into why it thinks it cannot vectorize it.
>
> I suggest opening a bug if my patch didn't help.
You patch does help vectorizing the loop, but not sure if it's in a
perfect way.  I guess vect_factor is a problem here.

Thanks,
bin
>
> Richard.
>
>> Thanks,
>> bin
>> >
>> > Richard.
>> >
>> >> Thanks,
>> >> bin
>> >>
>> >> > needs to be adjusted and one has to revisit PR62178 to check whether
>> >> > the result is still ok (or simply add -fno-loop-interchange to it).
>> >> >
>> >> > Richard.
>> >> >
>> >> >> Thanks,
>> >> >>
>> >> >> Christophe
>> >> >>
>> >> >> > Richard.
>> >> >> >
>> >> >> >> Thanks,
>> >> >> >> bin
>> >> >> >> >
>> >> >> >> >[local count: 161061274]:
>> >> >> >> >   # m_58 = PHI <1(10), m_84(20)>
>> >> >> >> > ...
>> >> >> >> >[local count: 912680551]:
>> >> >> >> >   # l_35 = PHI <1(13), l_57(21)>
>> >> >> >> > ...
>> >> >> >> >   y__I_lsm.113_140 = *y_139(D)[_31];
>> >> >> >> >   y__I_lsm.113_94 = m_58 != 1 ? y__I_lsm.113_140 : 0.0;
>> >> >> >> > ...
>> >> >> >> >   *y_139(D)[_31] = _101;
>> >> >> >> >
>> >> >> >> >
>> >> >> >> > so we have a COND_EXPR with a test on an integer IV m_58 with
>> >> >> >> > double values.  Note that the m_58 != 1 condition is invariant
>> >> >> >> > in the l loop.
>> >> >> >> >
>> >> >> >> > Currently we vectorize this condition using V8SImo

[arm] Generate a -mfpu= option for passing to the assembler

2017-12-08 Thread Richard Earnshaw (lists)
When gcc runs with the new -mfpu=auto option (either explicitly or when
that's the default behaviour) then this option is not passed through to
the assembler as we cannot rely on the assembler understanding it
(currently it doesn't understand it at all, but in future that might
change).  That means that the assembler falls back to its builtin
default, which may not correspond to what the user expected based on the
command-line options they passed.

Normally that wouldn't matter because assembler files generated by the
compiler will contain explicit directives that set the FPU type directly
and override any internal defaults; but when the compiler driver is used
to invoke the assembler directly (because the source file ends in .s or
.S) then this might cause a problem if that assumes the FPU matches the
compiler.

To address this, this patch makes the driver construct a -mfpu= option
for the assembler in the same way as the compiler generates an internal
.fpu directive.  As mentioned, this makes no difference if the assembler
file explicitly overrides the command line options, but helps in the
case where this is implicit.

2017-06-08  Richard Earnshaw  

* config/arm/arm.h (arm_asm_auto_mfpu): Declare.
(ASM_CPU_SPEC_FUNCTIONS): Add new rule asm_auto_mfpu.
(ASM_CPU_SPEC): Use it if -mfpu is set to auto.
* common/config/arm/arm-common.c (arm_asm_auto_mfpu): New function.
diff --git a/gcc/common/config/arm/arm-common.c b/gcc/common/config/arm/arm-common.c
index 5ae20fe..90b04f1 100644
--- a/gcc/common/config/arm/arm-common.c
+++ b/gcc/common/config/arm/arm-common.c
@@ -823,6 +823,86 @@ arm_be8_option (int argc, const char **argv)
   return "";
 }
 
+/* Generate a -mfpu= option for passing to the assembler.  This is
+   only called when -mfpu was set (possibly defaulted) to auto and is
+   needed to ensure that the assembler knows the correct FPU to use.
+   It wouldn't really be needed except that the compiler can be used
+   to invoke the assembler directly on hand-written files that lack
+   the necessary internal .fpu directives.  We assume that the architecture
+   canonicalization calls have already been made so that we have a final
+   -march= option to derive the fpu from.  */
+const char*
+arm_asm_auto_mfpu (int argc, const char **argv)
+{
+  static char *auto_fpu = NULL;
+  const char *arch = NULL;
+  static const enum isa_feature fpu_bitlist[]
+= { ISA_ALL_FPU_INTERNAL, isa_nobit };
+  const arch_option *selected_arch;
+  static const char* fpuname = "softvfp";
+
+  /* Handle multiple calls to this routine.  */
+  if (auto_fpu)
+{
+  free (auto_fpu);
+  auto_fpu = NULL;
+}
+
+  while (argc)
+{
+  if (strcmp (argv[0], "arch") == 0)
+	arch = argv[1];
+  else
+	fatal_error (input_location,
+		 "unrecognized operand to %%:asm_auto_mfpu");
+  argc -= 2;
+  argv += 2;
+}
+
+  auto_sbitmap target_isa (isa_num_bits);
+  auto_sbitmap fpubits (isa_num_bits);
+
+  gcc_assert (arch != NULL);
+  selected_arch = arm_parse_arch_option_name (all_architectures,
+	  "-march", arch);
+  if (selected_arch == NULL)
+return "";
+
+  arm_initialize_isa (target_isa, selected_arch->common.isa_bits);
+  arm_parse_option_features (target_isa, &selected_arch->common,
+			 strchr (arch, '+'));
+  arm_initialize_isa (fpubits, fpu_bitlist);
+
+  bitmap_and (fpubits, fpubits, target_isa);
+
+  /* The logic below is essentially identical to that in
+ arm.c:arm_identify_fpu_from_isa(), but that only works in the main
+ part of the compiler.  */
+
+  /* If there are no FPU capability bits, we just pass -mfpu=softvfp.  */
+  if (!bitmap_empty_p (fpubits))
+{
+  unsigned int i;
+  auto_sbitmap cand_fpubits (isa_num_bits);
+  for (i = 0; i < TARGET_FPU_auto; i++)
+	{
+	  arm_initialize_isa (cand_fpubits, all_fpus[i].isa_bits);
+	  if (bitmap_equal_p (fpubits, cand_fpubits))
+	{
+	  fpuname = all_fpus[i].name;
+	  break;
+	}
+	}
+
+  gcc_assert (i != TARGET_FPU_auto);
+}
+
+  auto_fpu = (char *) xmalloc (strlen (fpuname) + sizeof ("-mfpu="));
+  strcpy (auto_fpu, "-mfpu=");
+  strcat (auto_fpu, fpuname);
+  return auto_fpu;
+}
+
 #undef ARM_CPU_NAME_LENGTH
 
 
diff --git a/gcc/config/arm/arm.h b/gcc/config/arm/arm.h
index b189951..ac51412 100644
--- a/gcc/config/arm/arm.h
+++ b/gcc/config/arm/arm.h
@@ -2166,13 +2166,16 @@ extern int making_const_table;
 
 extern const char *arm_rewrite_mcpu (int argc, const char **argv);
 extern const char *arm_rewrite_march (int argc, const char **argv);
+extern const char *arm_asm_auto_mfpu (int argc, const char **argv);
 #define ASM_CPU_SPEC_FUNCTIONS			\
   { "rewrite_mcpu", arm_rewrite_mcpu },	\
-  { "rewrite_march", arm_rewrite_march },
+  { "rewrite_march", arm_rewrite_march },	\
+  { "asm_auto_mfpu", arm_asm_auto_mfpu },
 
 #define ASM_CPU_SPEC			\
+  " %{mfpu=auto:%

Re: [PR80693] drop value of parallel SETs dropped by combine

2017-12-08 Thread Kyrill Tkachov

Hi Alexandre,

On 07/12/17 21:01, Alexandre Oliva wrote:

On Jul  7, 2017, Segher Boessenkool  wrote:

> I meant, just double check if your new
> code does the correct thing for the set count.

Sorry this took me so long to get back to.  Even this was difficult for
me to answer for sure, then and now.

We don't (and can't?) know whether the REG_UNUSED note originally
pertained to a clobber or a set, but I think that doesn't matter: unless
we've reused the REG in i2 as a scratch, I think we should decrement the
set count, because we used to have a SET or a CLOBBER that is now gone.

Looking back at the issue, I realized we should keep/add the REG_UNUSED
note to i2, if the reg is mentioned in i2, possibly turned into REG_DEAD
if i2 is referenced but not set.

Still, I'm concerned I haven't caught all of the cases in which
adjusting REG_N_SETS would be needed: we might have dropped multiple
SETs of the same pseudo, each with its own REG_UNUSED note (say, from
all of i3, i2, i1, and i0), and the current logic will only decrement
REG_N_SETS once, and only if i2 no longer sets the register.

I'm also concerned that the logic for when the REG is set or referenced
in i3 is incomplete: references in i3 might have come from any of the
previous insns, even if intervening sets of the same register were
substituted and thus removed.  Consider the following nightmarish 
scenario:


i0: (parallel [(set (reg CC) (op1 (reg A)))
   (set (reg A) (plus (reg A) (const_int 1)))])
(REG_UNUSED (reg A))
i1: (set (reg A) (ne (reg CC) (const_int 0)))
(REG_DEAD (reg CC))
i2: (parallel [(set (reg CC) (op2 (reg A)))
   (set (reg A) (plus (reg A) (const_int 1)))])
(REG_UNUSED (reg A)))
i3: (set (reg A) (eq (reg CC) (const_int 0)))
(REG_DEAD (reg CC))

we might turn that into say:

i2: (set (reg CC) (op3 (reg A)))
(REG_DEAD (reg A))
i3: (set (reg A) (op4 (reg CC)))
(REG_DEAD (reg CC))

and now we'd have to somehow figure out that we're to discount the two
unused sets of reg A, those from i0 and i2, and to turn either
REG_UNUSED note about reg A into a REG_DEAD note to be placed at i2.  A
is set at i3, so combine should record its new value, but if it's
computed in terms of the scratch CC and a much-older A, will we get the
correct value?  Or is the value unchanged because it's the output of the
latest insn?

Now, consider this slightly simpler scenario (trying to combine only the
first 3 insns):

i0: nil
i1: (parallel [(set (reg CC) (op1 (reg A)))
   (set (reg A) (plus (reg A) (const_int 1)))])
(REG_UNUSED (reg A))
i2: (set (reg A) (ne (reg CC) (const_int 0)))
(REG_DEAD (reg CC))
i3: (parallel [(set (reg CC) (op2 (reg A)))
   (set (reg A) (plus (reg A) (const_int 1)))])
(REG_UNUSED (reg A)))

this might combine into:

i2: (set (reg A) (op5 (reg A)))
i3: (set (reg CC) (op6 (reg A)))
(REG_DEAD (reg A))

and now we have removed 3 sets to A, but added 1 by splitting within
combine using A as scratch.  Would we then have to figure out that for
each of the REG_UNUSED notes pertaining to A we have to drop the
REG_N_SETS count by 1, although A remains used in i3, and set and used
in i2?  I don't see how.

I see that combine would record the value for reg A at i2 in this case,
but would it express it in terms of which earlier value of reg A?
Shouldn't we have reset it while placing notes in this case too?

> It wasn't obvious to me (this code is horribly complicated).  Whether
> all existing code is correct...  it's probably best not to look too
> closely :-/

You're right about its being horribly complicated.

Maybe I should go about it incrementally.

> If you have a patch you feel confident in, could you post it again
> please?

So let me tell you how I feel about this.  It has waited long enough,
and there are at least 3 bugs known to be fixed by the first very simple
patch below.  The catch is that it doesn't adjust REG_N_SETS at all (we
didn't before the patch, and that didn't seem to hurt too much).  I've
regstrapped this successfully on x86_64-linux-gnu and i686-linux-gnu.

--->cut<---
When combine drops a REG_UNUSED SET in a parallel, we have to clear
cached values, so that, even if the REGs remain used (e.g. because
they were referenced in the used SET_SRC), we will not use properties
of the dropped modified value as if they applied to the preserved
original one.

We fail to adjust REG_N_SETS.

for  gcc/ChangeLog

PR rtl-optimization/80693
PR rtl-optimization/81019
PR rtl-optimization/81020
* combine.c (distribute_notes): Reset any REG_UNUSED REGs that
are not mentioned in i3.  Place the REG_UNUSED note on i2,
possibly modified to REG_DEAD, if it did not originate in i3.

for  gcc/testsuite/ChangeLog

PR rtl-optimization/80693
PR rtl-optimization/81019
PR rtl-optimization/81020
* gcc.dg/pr80693.c: New.
* gcc.dg/pr81019.c: New.
---
 gcc/combine.c  

[PATCH] combine: Fix PR83304

2017-12-08 Thread Segher Boessenkool
In PR83304 two insns are combined, where the I2 uses a register that
has a REG_DEAD note on an insn after I2 but before I3.  In such a case
move_deaths should move that death note.  But move_deaths only looks
at the reg_stat[regno].last_death insn, and that field can be zeroed
out (previously, use_crosses_set_p would prevent the combination in
this case).

If the last_death field is zero it means "unknown", not "no death", so
we have to find if there is a REG_DEAD note.

This patch does that, fixing the Arm PR83304 testcase.  Also bootstrapped
and tested on powerpc64-linux {-m32,-m64}.

Committing to trunk.


Segher


2017-12-08  Segher Boessenkool  

PR rtl-optimization/83304
* combine.c (move_deaths): If we do not know where a register died,
search for it.

---
 gcc/combine.c | 20 
 1 file changed, 20 insertions(+)

diff --git a/gcc/combine.c b/gcc/combine.c
index 748c9a8..b12484a 100644
--- a/gcc/combine.c
+++ b/gcc/combine.c
@@ -13910,6 +13910,26 @@ move_deaths (rtx x, rtx maybe_kill_insn, int 
from_luid, rtx_insn *to_insn,
   unsigned int regno = REGNO (x);
   rtx_insn *where_dead = reg_stat[regno].last_death;
 
+  /* If we do not know where the register died, it may still die between
+FROM_LUID and TO_INSN.  If so, find it.  This is PR83304.  */
+  if (!where_dead)
+   {
+ rtx_insn *insn = prev_real_insn (to_insn);
+ while (insn
+&& BLOCK_FOR_INSN (insn) == BLOCK_FOR_INSN (to_insn)
+&& DF_INSN_LUID (insn) >= from_luid)
+   {
+ if (dead_or_set_regno_p (insn, regno))
+   {
+ if (find_regno_note (insn, REG_DEAD, regno))
+   where_dead = insn;
+ break;
+   }
+
+ insn = prev_real_insn (insn);
+   }
+   }
+
   /* Don't move the register if it gets killed in between from and to.  */
   if (maybe_kill_insn && reg_set_p (x, maybe_kill_insn)
  && ! reg_referenced_p (x, maybe_kill_insn))
-- 
1.8.3.1



[arm] Don't strip off all architecture features from -march passed to assembler

2017-12-08 Thread Richard Earnshaw (lists)
When GCC invokes the assembler it generates a sanitized version of the
user-specified -march option to pass through, since the assembler does
not understand all the new FPU-related architectural options.
Unfortunately it goes too far and strips off all the architectural
extensions, including some that are unrelated to the -mfpu variant
selected.

Again, this doesn't really matter when compiling C code because the
compiler will override the command-line specified architecture with
directives in the assembly file itself, but when using the compiler
driver to invoke the assembler the only indiciation of the desired
architecture might come from the command line.

We fix this by adjusting the canonicalization pass to remove any
option that only specifies features that can be expressed by -mfpu
(any that go beyond that are already supported by the assembler).  We
do have to take care to re-order the options, though as the assembler
expects feature options to be in a canonical order (unlike the
compiler, where ordering is handled left-to-right: there's only a
difference if there are negation options, but a canonicalized
architecture string shouldn't have any of those).  We do this by
recording which options we need and then sorting the final list
alphabetically.

* common/config/arm/arm-common.c: Include .
(INCLUDE_VECTOR): Define.
(compare_opt_names): New function.
(arm_rewrite_selected_arch): Only strip out extensions that can be
expressed through -mfpu.  Sort the remaining extensions
alphabetically.

diff --git a/gcc/common/config/arm/arm-common.c b/gcc/common/config/arm/arm-common.c
index 90b04f1618e39441c2e7395bbea2487f49a3170c..d6374276a109cb8bc0dbe8640af4accc57a81496 100644
--- a/gcc/common/config/arm/arm-common.c
+++ b/gcc/common/config/arm/arm-common.c
@@ -18,6 +18,7 @@
.  */
 
 #define INCLUDE_LIST
+#define INCLUDE_VECTOR
 #include "config.h"
 #include "system.h"
 #include "coretypes.h"
@@ -30,6 +31,7 @@
 #include "flags.h"
 #include "sbitmap.h"
 #include "diagnostic.h"
+#include 
 
 /* Set default optimization options.  */
 static const struct default_options arm_option_optimization_table[] =
@@ -114,26 +116,99 @@ arm_rewrite_mcpu (int argc, const char **argv)
   return arm_rewrite_selected_cpu (argv[argc - 1]);
 }
 
-/* Truncate NAME at the first '+' character seen, or return
-   NAME unmodified.  Similar to arm_rewrite_selected_cpu, but we must
-   preserve '.' as that is part of some architecture names.  */
+static bool
+compare_opt_names (const char *first, const char *second)
+{
+  for (int i = 0; ; i++)
+if (first[i] == 0
+	|| first[i] < second[i])
+  return true;
+  return false;
+}
 
+/* Rewrite the architecture string for passing to the assembler.
+   Although the syntax is similar we cannot assume that it supports
+   the newer FP related options.  So strip any option that only
+   defines features in the standard -mfpu options out.  We'll generate
+   a suitable -mfpu option elsewhere to carry that information.  NAME
+   should already have been canonicalized, so we do not expect to
+   encounter +no.. options that remove features.  A final problem is
+   that the assembler expects the feature extensions to be listed
+   alphabetically, so we build a list of required options and then
+   sort them into canonical order in the resulting string.  */
 const char *
 arm_rewrite_selected_arch (const char *name)
 {
-  static char output_buf[ARM_CPU_NAME_LENGTH + 1] = {0};
-  char *arg_pos;
+  /* The result we return needs to be semi persistent, so handle being
+ re-invoked.  */
+  static char *asm_arch = NULL;
 
-  strncpy (output_buf, name, ARM_CPU_NAME_LENGTH);
-  output_buf[ARM_CPU_NAME_LENGTH] = 0;
+  if (asm_arch)
+{
+  free (asm_arch);
+  asm_arch = NULL;
+}
 
-  arg_pos = strchr (output_buf, '+');
+  const char *arg_pos = strchr (name, '+');
 
-  /* If we found a '+' truncate the entry at that point.  */
-  if (arg_pos)
-*arg_pos = '\0';
+  /* No extension options? just return the original string.  */
+  if (arg_pos == NULL)
+return name;
 
-  return output_buf;
+  const arch_option *arch_opt
+= arm_parse_arch_option_name (all_architectures, "-march", name);
+
+  auto_sbitmap fpu_bits (isa_num_bits);
+  static const enum isa_feature fpu_bitlist[]
+= { ISA_ALL_FPU_INTERNAL, isa_nobit };
+
+  arm_initialize_isa (fpu_bits, fpu_bitlist);
+
+  auto_sbitmap opt_bits (isa_num_bits);
+
+  /* Ensure that the resulting string is large enough for the result.  We
+ never add options, so using strdup here will ensure that.  */
+  asm_arch = xstrdup (name);
+  asm_arch[arg_pos - name] = '\0';
+
+  std::vectoroptlist;
+
+  while (arg_pos)
+{
+  const char *end = strchr (arg_pos + 1, '+');
+  size_t len = end ? end - arg_pos : strlen (arg_pos);
+
+  for (const cpu_arch_extension *entry = arch_opt->common.extensions;
+	   entry->name != NULL;
+	   entry++)
+	{
+	  

Re: [PATCH][RFA][P1 PR tree-optimization/83298] Avoid over-optimistic result range for COND_EXPR

2017-12-08 Thread Richard Biener
On Fri, Dec 8, 2017 at 1:18 AM, Jeff Law  wrote:
>
> So the underlying issue here is quite simple.  Given something like
>
> x = (cond) ? res1 : res2;
>
> EVRP analysis will compute the resultant range using vrp_meet of the
> ranges for res1 and res2.  Seems pretty natural.
>
> vrp_meet makes optimistic assumptions if either range is VR_UNDEFINED
> and will set the resultant range to the range of the other operand.
>
> Some callers explicitly mention this is the desired behavior (PHI
> processing).  Other callers avoid calling vrp_meet when one of the
> ranges is VR_UNDEFINED and do something sensible
> (extract_range_from_unary_expr, extract_range_from_binary_expr_1).
>
> extract_range_from_cond_expr neither mentions that it wants the
> optimistic behavior nor does it avoid calling vrp_meet with a
> VR_UNDEFINED range.  It naturally seems to fit in better with the other
> extract_range_from_* routines.
>
> I'm not at all familiar with the ipa-cp bits, but from a quick look they
> also seems to fall into the extract_* camp.
>
>
> Anyway, normally in a domwalk the only place where we're going to see
> VR_UNDEFINED would be in the PHI nodes.  It's one of the nice properties
> of a domwalk :-)
>
> However, for jump threading we look past the dominance frontier;
> furthermore, we do not currently record ranges for statements we process
> as part of the jump threading.  But we do try to extract the range each
> statement generates -- we're primarily looking for cases where the
> statement generates a singleton range.
>
> While the plan does include recording ranges as we look past the
> dominance frontier, I strongly believe some serious code cleanup in DOM
> and jump threading needs to happen first.  So I don't want to go down
> that path for gcc-8.
>
> So we're kind-of stuck with the fact that we might query for a resultant
> range when one or more input operands may not have recorded range
> information.  Thankfully that's easily resolved by making
> extract_range_from_cond_expr work like the other range extraction
> routines and avoid calling vrp_meet when one or more operands is
> VR_UNDEFINED.
>
> Bootstrapped and regression tested on x86_64.  OK for the trunk?

But that does regress the case of either arm being an uninitialized
variable.

I'm not convinced that when you look forward past the dominance frontier
and do VRP analysis on stmts without analyzing all intermediate
stmts on the path (or at least push all defs on that path temporarily
to VR_VARYING) is fixed by this patch.  It merely looks like a wrong
workaround for a fundamental issue in how DOM now uses the
interface?

Thanks,
Richard.

> Jeff
>
>
> PR tree-optimization/83298
> * vr-values.c (vr_values::extract_range_from_cond_expr): Do not
> call vrp_meet if one of the input operands is VR_UNDEFINED.
>
> PR tree-optimization/83298
> * gcc.c-torture/execute/pr83298.c: New test.
>
> diff --git a/gcc/testsuite/gcc.c-torture/execute/pr83298.c 
> b/gcc/testsuite/gcc.c-torture/execute/pr83298.c
> new file mode 100644
> index 000..0e51ababf5c
> --- /dev/null
> +++ b/gcc/testsuite/gcc.c-torture/execute/pr83298.c
> @@ -0,0 +1,11 @@
> +
> +int a, b, c = 1;
> +
> +int main ()
> +{
> +  for (; b < 1; b++)
> +;
> +  if (!(c * (a < 1)))
> +__builtin_abort ();
> +  return 0;
> +}
> diff --git a/gcc/vr-values.c b/gcc/vr-values.c
> index 9352e120d9d..ee5ae3c6a27 100644
> --- a/gcc/vr-values.c
> +++ b/gcc/vr-values.c
> @@ -912,6 +912,23 @@ vr_values::extract_range_from_cond_expr (value_range 
> *vr, gassign *stmt)
>else
>  set_value_range_to_varying (&vr1);
>
> +  /* If either range is VR_UNDEFINED, vrp_meet will make the optimistic
> + choice and use the range of the other operand as the result range.
> +
> + Other users of vrp_meet either explicitly filter the calls for
> + this case, or they do not care (PHI processing where unexecutable
> + edges are explicitly expected to be ignored).
> +
> + Like most other callers, we can not generally tolerate the optimistic
> + choice here.  Consider jump threading where we're looking into a
> + non-dominated block and thus may not necessarily have processed the
> + ranges for statements within that non-dominated block.  */
> +  if (vr0.type == VR_UNDEFINED || vr1.type == VR_UNDEFINED)
> +{
> +  set_value_range_to_varying (vr);
> +  return;
> +}
> +
>/* The resulting value range is the union of the operand ranges */
>copy_value_range (vr, &vr0);
>vrp_meet (vr, &vr1);
>


RE: [PATCH][GCC][ARM] Generate .arch and .arch_extensions for each function if required. [Patch (3/3)]

2017-12-08 Thread Tamar Christina
Hi Christoph,

> >> > gcc/testsuite/
> >> > 2017-11-06  Tamar Christina  
> >> >
> >> > PR target/82641
> >> > * gcc.target/arm/pragma_arch_attribute_2.c: New.
> >> > * gcc.target/arm/pragma_arch_attribute_2.c: New.
> >> > * gcc.target/arm/pragma_arch_attribute_3.c: New.
> >> > * gcc.target/arm/pragma_fpu_attribute.c: New.
> >> > * gcc.target/arm/pragma_fpu_attribute_2.c: New.
> >> >
> >> > --
> 
> I'm afraid you'll have to update the testcases: they fail on non-hf targets
> (arm-none-linux-gnueabi, arm-none-eabi), because:
> In file included from /gcc/testsuite/gcc.target/arm/pragma_fpu_attribute.c:7:
> /aci-gcc-fsf/builds/gcc-fsf-gccsrc/obj-arm-none-linux-
> gnueabi/gcc3/gcc/include/arm_neon.h:31:2:
> error: #error "NEON intrinsics not available with the soft-float ABI.
> Please use -mfloat-abi=softfp or -mfloat-abi=hard"
> /gcc/testsuite/gcc.target/arm/pragma_fpu_attribute.c:11:53: error:
> unknown type name 'poly64x1_t'
> /gcc/testsuite/gcc.target/arm/pragma_fpu_attribute.c:11:71: error:
> unknown type name 'poly64x1_t'

Well this is incredibly frustrating. I tested these configurations repeatedly 
locally:

Test Run By tnfchris on Thu Dec  7 13:39:54 2017

   
Target is arm-none-eabi
Host   is x86_64-pc-linux-gnu

/d/t/g/s/gcc (arm-implement-pragma-arch-verify ↩☡) grep 
"pragma_fpu_attribute\.c" ../../build-arm-none-eabi/results/vanilla/gcc.sum
PASS: gcc.target/arm/pragma_fpu_attribute.c (test for excess errors)
PASS: gcc.target/arm/pragma_fpu_attribute.c scan-assembler-times 
\\.fpu\\s+crypto-neon-fp-armv8 1
PASS: gcc.target/arm/pragma_fpu_attribute.c scan-assembler-times 
\\.fpu\\s+vfpv3-d16 1
PASS: gcc.target/arm/pragma_fpu_attribute.c (test for excess errors)
PASS: gcc.target/arm/pragma_fpu_attribute.c scan-assembler-times 
\\.fpu\\s+crypto-neon-fp-armv8 1
PASS: gcc.target/arm/pragma_fpu_attribute.c scan-assembler-times 
\\.fpu\\s+vfpv3-d16 1

--

Test Run By tnfchris on Thu Dec  7 11:08:29 2017
Native configuration is arm-none-linux-gnueabihf

tnfchris@native:~/gcc-arm$ grep "pragma_fpu_attribute\.c" 
./gcc/testsuite/gcc/gcc.sum
PASS: gcc.target/arm/pragma_fpu_attribute.c (test for excess errors)
PASS: gcc.target/arm/pragma_fpu_attribute.c scan-assembler-times 
\\.fpu\\s+crypto-neon-fp-armv8 1
PASS: gcc.target/arm/pragma_fpu_attribute.c scan-assembler-times 
\\.fpu\\s+vfpv3-d16 1

So I'm quite surprised about this. In any case, I will look into it.

Thanks,
Tamar


> 
> Looking at other attributes tests, maybe you need to add arm_neon_ok?
> 
> Thanks,
> Christophe


Re: [PATCH][RFA][P1 PR tree-optimization/83298] Avoid over-optimistic result range for COND_EXPR

2017-12-08 Thread Richard Biener
On Fri, Dec 8, 2017 at 1:18 AM, Jeff Law  wrote:
>
> So the underlying issue here is quite simple.  Given something like
>
> x = (cond) ? res1 : res2;
>
> EVRP analysis will compute the resultant range using vrp_meet of the
> ranges for res1 and res2.  Seems pretty natural.
>
> vrp_meet makes optimistic assumptions if either range is VR_UNDEFINED
> and will set the resultant range to the range of the other operand.
>
> Some callers explicitly mention this is the desired behavior (PHI
> processing).  Other callers avoid calling vrp_meet when one of the
> ranges is VR_UNDEFINED and do something sensible
> (extract_range_from_unary_expr, extract_range_from_binary_expr_1).

Note that "those" simply optimize the VR_UNDEFINED case by ignoring
the range.  They basically take a shortcut around vrp_meet and avoid
calling extract_range_* on VR_UNDEFINED ranges.

> extract_range_from_cond_expr neither mentions that it wants the
> optimistic behavior nor does it avoid calling vrp_meet with a
> VR_UNDEFINED range.  It naturally seems to fit in better with the other
> extract_range_from_* routines.
>
> I'm not at all familiar with the ipa-cp bits, but from a quick look they
> also seems to fall into the extract_* camp.
>
>
> Anyway, normally in a domwalk the only place where we're going to see
> VR_UNDEFINED would be in the PHI nodes.  It's one of the nice properties
> of a domwalk :-)
>
> However, for jump threading we look past the dominance frontier;
> furthermore, we do not currently record ranges for statements we process
> as part of the jump threading.  But we do try to extract the range each
> statement generates -- we're primarily looking for cases where the
> statement generates a singleton range.
>
> While the plan does include recording ranges as we look past the
> dominance frontier, I strongly believe some serious code cleanup in DOM
> and jump threading needs to happen first.  So I don't want to go down
> that path for gcc-8.
>
> So we're kind-of stuck with the fact that we might query for a resultant
> range when one or more input operands may not have recorded range
> information.  Thankfully that's easily resolved by making
> extract_range_from_cond_expr work like the other range extraction
> routines and avoid calling vrp_meet when one or more operands is
> VR_UNDEFINED.
>
> Bootstrapped and regression tested on x86_64.  OK for the trunk?
>
> Jeff
>
>
> PR tree-optimization/83298
> * vr-values.c (vr_values::extract_range_from_cond_expr): Do not
> call vrp_meet if one of the input operands is VR_UNDEFINED.
>
> PR tree-optimization/83298
> * gcc.c-torture/execute/pr83298.c: New test.
>
> diff --git a/gcc/testsuite/gcc.c-torture/execute/pr83298.c 
> b/gcc/testsuite/gcc.c-torture/execute/pr83298.c
> new file mode 100644
> index 000..0e51ababf5c
> --- /dev/null
> +++ b/gcc/testsuite/gcc.c-torture/execute/pr83298.c
> @@ -0,0 +1,11 @@
> +
> +int a, b, c = 1;
> +
> +int main ()
> +{
> +  for (; b < 1; b++)
> +;
> +  if (!(c * (a < 1)))
> +__builtin_abort ();
> +  return 0;
> +}
> diff --git a/gcc/vr-values.c b/gcc/vr-values.c
> index 9352e120d9d..ee5ae3c6a27 100644
> --- a/gcc/vr-values.c
> +++ b/gcc/vr-values.c
> @@ -912,6 +912,23 @@ vr_values::extract_range_from_cond_expr (value_range 
> *vr, gassign *stmt)
>else
>  set_value_range_to_varying (&vr1);
>
> +  /* If either range is VR_UNDEFINED, vrp_meet will make the optimistic
> + choice and use the range of the other operand as the result range.
> +
> + Other users of vrp_meet either explicitly filter the calls for
> + this case, or they do not care (PHI processing where unexecutable
> + edges are explicitly expected to be ignored).
> +
> + Like most other callers, we can not generally tolerate the optimistic
> + choice here.  Consider jump threading where we're looking into a
> + non-dominated block and thus may not necessarily have processed the
> + ranges for statements within that non-dominated block.  */
> +  if (vr0.type == VR_UNDEFINED || vr1.type == VR_UNDEFINED)
> +{
> +  set_value_range_to_varying (vr);
> +  return;
> +}
> +
>/* The resulting value range is the union of the operand ranges */
>copy_value_range (vr, &vr0);
>vrp_meet (vr, &vr1);
>


[arm] PR target/83206: Make native driver select fp-capable armv6 cores

2017-12-08 Thread Richard Earnshaw (lists)
A quirk in the historical naming of some ARMv6 products means that the
main CPU name implies the presence or otherwise of the floating point unit.
This causes problems when using -mfpu=auto with -mcpu=native: the driver is
picking a CPU that does not support a floating-point unit, even though
one may well exist.

This patch addresses this by selecting the FP-capable names so that FP
instructions will be generated if the other options suggest this is
permitted.

Note that a more complete fix is really needed here to look up the
FP/simd capabilities and append the appropriate capability extensions.
This will be the subject of some follow-up patches.

* config/arm/driver-arm.c (arm_cpu_table): Use fp-capable product names
for armv6 ARM CPU IDs.
diff --git a/gcc/config/arm/driver-arm.c b/gcc/config/arm/driver-arm.c
index 5c29b94caaba4ff6f89a191f1d8edcf10431c0b3..86212315f13e36e35d850cae624e17db464db8d9 100644
--- a/gcc/config/arm/driver-arm.c
+++ b/gcc/config/arm/driver-arm.c
@@ -33,12 +33,12 @@ static struct vendor_cpu arm_cpu_table[] = {
 {"0x926", "armv5te", "arm926ej-s"},
 {"0xa26", "armv5te", "arm1026ej-s"},
 {"0xb02", "armv6k", "mpcore"},
-{"0xb36", "armv6j", "arm1136j-s"},
-{"0xb56", "armv6t2", "arm1156t2-s"},
+{"0xb36", "armv6j", "arm1136jf-s"},
+{"0xb56", "armv6t2", "arm1156t2f-s"},
 /* armv6kz is the correct spelling for ARMv6KZ but may not be supported in
the version of binutils used.  The incorrect spelling is supported in
legacy and current binutils so that is used instead.  */
-{"0xb76", "armv6zk", "arm1176jz-s"},
+{"0xb76", "armv6zk", "arm1176jzf-s"},
 {"0xc05", "armv7-a", "cortex-a5"},
 {"0xc07", "armv7ve", "cortex-a7"},
 {"0xc08", "armv7-a", "cortex-a8"},


[PATCH] Add -fopt-info-loop support for interchange.

2017-12-08 Thread Richard Biener

Bootstrapped and tested on x86_64-unknown-linux-gnu, applied.

Richard.

2017-12-08  Richard Biener  

* gimple-loop-interchange.cc (tree_loop_interchange::interchange):
Provide -fopt-info-loop feedback when we interchange in a nest.

Index: gcc/gimple-loop-interchange.cc
===
--- gcc/gimple-loop-interchange.cc  (revision 255499)
+++ gcc/gimple-loop-interchange.cc  (working copy)
@@ -1550,6 +1550,7 @@ bool
 tree_loop_interchange::interchange (vec datarefs,
vec ddrs)
 {
+  location_t loc = find_loop_location (m_loop_nest[0]);
   bool changed_p = false;
   /* In each iteration we try to interchange I-th loop with (I+1)-th loop.
  The overall effect is to push inner loop to outermost level in whole
@@ -1597,8 +1598,12 @@ tree_loop_interchange::interchange (vec<
 oloop.m_loop->num, iloop.m_loop->num);
}
 }
-
   simple_dce_from_worklist (m_dce_seeds);
+
+  if (changed_p)
+dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, loc,
+"loops interchanged in loop nest\n");
+
   return changed_p;
 }
 


Fix ICE on overflow of profile-count

2017-12-08 Thread Jan Hubicka
Hi,
the testcase triggers ICE because loop iterates too many times.  We used to
work around by capping all counts to 1 but that leads to many problems
during IPA profile propagation (because there is no way to determine
frequency of call given frequency of entry when entry frequency is 0).

This patch adds capping.  It seems very rare case and profile is either
already wrong or the program would run just too long to finish.

Bootstrapped/regtested x86_64-linux, comitted.

Honza

* profile-count.c (profile_count::from_gcov_type): Move from
profile-count.h; handle overflow.
* profile-count. (profile_count::from_gcov_type): Move offline.

PR middle-end/83609
* gcc.c-torture/compile/pr83069.c: New testcase.
Index: profile-count.c
===
--- profile-count.c (revision 255466)
+++ profile-count.c (working copy)
@@ -327,3 +327,21 @@ profile_count::combine_with_ipa_count (p
 return this->global0 ();
   return this->global0adjusted ();
 }
+
+/* The profiling runtime uses gcov_type, which is usually 64bit integer.
+   Conversions back and forth are used to read the coverage and get it
+   into internal representation.  */
+profile_count
+profile_count::from_gcov_type (gcov_type v)
+  {
+profile_count ret;
+gcc_checking_assert (v >= 0);
+if (dump_file && v >= (gcov_type)max_count)
+  fprintf (dump_file,
+  "Capping gcov count %" PRId64 " to max_count %" PRId64 "\n",
+  (int64_t) v, (int64_t) max_count);
+ret.m_val = MIN (v, (gcov_type)max_count);
+ret.m_quality = profile_precise;
+return ret;
+  }
+
Index: profile-count.h
===
--- profile-count.h (revision 255466)
+++ profile-count.h (working copy)
@@ -667,18 +667,6 @@ public:
   return c;
 }
 
-  /* The profiling runtime uses gcov_type, which is usually 64bit integer.
- Conversions back and forth are used to read the coverage and get it
- into internal representation.  */
-  static profile_count from_gcov_type (gcov_type v)
-{
-  profile_count ret;
-  gcc_checking_assert (v >= 0 && (uint64_t) v <= max_count);
-  ret.m_val = v;
-  ret.m_quality = profile_precise;
-  return ret;
-}
-
   /* Conversion to gcov_type is lossy.  */
   gcov_type to_gcov_type () const
 {
@@ -1083,6 +1071,11 @@ public:
  global0.  */
   profile_count combine_with_ipa_count (profile_count ipa);
 
+  /* The profiling runtime uses gcov_type, which is usually 64bit integer.
+ Conversions back and forth are used to read the coverage and get it
+ into internal representation.  */
+  static profile_count from_gcov_type (gcov_type v);
+
   /* LTO streaming support.  */
   static profile_count stream_in (struct lto_input_block *);
   void stream_out (struct output_block *);
Index: testsuite/gcc.c-torture/compile/pr83069.c
===
--- testsuite/gcc.c-torture/compile/pr83069.c   (revision 0)
+++ testsuite/gcc.c-torture/compile/pr83069.c   (working copy)
@@ -0,0 +1,14 @@
+#define MAX 98
+
+void foo (unsigned long *res, unsigned long in)
+{
+  for (unsigned long a = 0; a < MAX; a++)
+for (unsigned long b = 0; b < MAX; b++)
+  for (unsigned long c = 0; c < MAX; c++)
+for (unsigned long d = 0; d < MAX; d++)
+  for (unsigned long e = 0; e < MAX; e++)
+for (unsigned long f = 0; f < MAX; f++)
+  for (unsigned long g = 0; g < MAX; g++)
+*res += a * in;
+}
+


Re: [PATCH 5/3] C++ bits to improve detection of attribute conflicts (PR 81544)

2017-12-08 Thread Jakub Jelinek
On Fri, Dec 08, 2017 at 11:32:54AM +0100, Rainer Orth wrote:
> Jakub Jelinek  writes:
> 
> > On Fri, Dec 08, 2017 at 10:43:58AM +0100, Rainer Orth wrote:
> >> The line numbers are completely misleading, unfortunately.  Hadn't
> >> SUBTARGET_ATTRIBUTE_TABLE been used at the end of the (very short)
> >> sparc_attribute_table, I wouldn't have seen what was wrong.
> >> 
> >> The following patch fixes the problem, installed on mainline after
> >> i386-pc-solaris2.11 and sparc-sun-solaris2.11 bootstraps completed
> >> without regressions.
> >> 
> >> However, there are two more SUBTARGET_ATTRIBUTE_TABLE defines:
> >> 
> >> gcc/config/darwin.h:#define SUBTARGET_ATTRIBUTE_TABLE \
> >> gcc/config/i386/cygming.h:#define SUBTARGET_ATTRIBUTE_TABLE \
> >
> > I'll deal with this.
> 
> This is what I'm currently testing with an  x86_64-apple-darwin11.4.2
> bootstrap.  Ok if it passes?

I've committed this instead, while only darwin.h and cygming.h were left
broken, the comments haven't been adjusted in many more cases.

BTW, we should swap handler and affects_type_identity fields at some point,
3 bools, then pointer, then bool, then pointer needs quite a lot of padding.

2017-12-08  Jakub Jelinek  

* config/arc/arc.c (arc_attribute_table): Add exclusions to
the comment.
* config/avr/avr.c (avr_attribute_table): Likewise.
* config/msp430/msp430.c (msp430_attribute_table): Likewise.
* config/rl78/rl78.c (rl78_attribute_table): Likewise.
* config/nds32/nds32.c (nds32_attribute_table): Likewise.
* config/darwin.h (SUBTARGET_ATTRIBUTE_TABLE): Initialize new member
of struct attribute_spec.
* config/i386/cygming.h (SUBTARGET_ATTRIBUTE_TABLE): Likewise.
ada/
* gcc-interface/utils.c (gnat_internal_attribute_table): Add
exclusions to the comment.
brig/
* brig-lang.c (brig_attribute_table): Fix up comment.

--- gcc/config/arc/arc.c.jj 2017-12-07 18:05:03.0 +0100
+++ gcc/config/arc/arc.c2017-12-08 11:20:24.605501619 +0100
@@ -218,7 +218,7 @@ static tree arc_handle_fndecl_attribute
 const struct attribute_spec arc_attribute_table[] =
 {
  /* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler,
-  affects_type_identity } */
+  affects_type_identity, exclusions } */
   { "interrupt", 1, 1, true, false, false, arc_handle_interrupt_attribute,
   true, NULL },
   /* Function calls made to this symbol must be done indirectly, because
--- gcc/config/avr/avr.c.jj 2017-12-07 18:05:02.0 +0100
+++ gcc/config/avr/avr.c2017-12-08 11:24:32.270391537 +0100
@@ -9875,7 +9875,7 @@ static const struct attribute_spec
 avr_attribute_table[] =
 {
   /* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler,
-   affects_type_identity } */
+   affects_type_identity, exclusions } */
   { "progmem",   0, 0, false, false, false,  avr_handle_progmem_attribute,
 false, NULL },
   { "signal",0, 0, true,  false, false,  avr_handle_fndecl_attribute,
--- gcc/config/darwin.h.jj  2017-11-28 12:11:41.0 +0100
+++ gcc/config/darwin.h 2017-12-08 11:18:05.866243854 +0100
@@ -741,11 +741,11 @@ extern GTY(()) section * darwin_sections
 /* Extra attributes for Darwin.  */
 #define SUBTARGET_ATTRIBUTE_TABLE   \
   /* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler, \
-   affects_type_identity } */  
 \
+   affects_type_identity, exclusions } */   \
   { "apple_kext_compatibility", 0, 0, false, true, false,   \
-darwin_handle_kext_attribute, false },  \
+darwin_handle_kext_attribute, false, NULL },\
   { "weak_import", 0, 0, true, false, false,\
-darwin_handle_weak_import_attribute, false }
+darwin_handle_weak_import_attribute, false, NULL }
 
 /* Make local constant labels linker-visible, so that if one follows a
weak_global constant, ld64 will be able to separate the atoms.  */
--- gcc/config/i386/cygming.h.jj2017-10-30 12:02:35.0 +0100
+++ gcc/config/i386/cygming.h   2017-12-08 11:25:12.372887945 +0100
@@ -448,9 +448,9 @@ do {\
 
 #define SUBTARGET_ATTRIBUTE_TABLE \
   { "selectany", 0, 0, true, false, false, ix86_handle_selectany_attribute, \
-false }
+false, NULL }
   /* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler,
-   affects_type_identity } */
+   affects_type_identity, exclusions } */
 
 /*  mcount() does not need a counter variable.  */
 #undef NO_PROFILE_COUNTERS
--- gcc/config/msp430/msp430.c.jj   2017-12-07 18:05:03.0 +0100
+++ gcc/config/msp430/msp430.c  2017-12-08 11:26:47.634691684 +0100
@@ -2050,7 +2050,7 @@ msp430_data_attr (tree * node,
 const struct attribute_spec msp430_attri

[spu, commit] Fix PR target/82960

2017-12-08 Thread Ulrich Weigand
Hello,

the ICE with --enable-checking=rtl reported in PR 82960 was caused by
spu.c:pad_bb using INSN_CODE on RTX with INSN_P false (specifically,
on jump_table_data).  Add checks to handle this case.

Tested on spu-elf, committed to mainline.

Bye,
Ulrich

ChangeLog:

PR target/82960
* config/spu/spu.c (pad_bb): Only check INSN_CODE when INSN_P is true.

Index: gcc/config/spu/spu.c
===
*** gcc/config/spu/spu.c(revision 255408)
--- gcc/config/spu/spu.c(working copy)
*** pad_bb(void)
*** 2029,2036 
for (; insn; insn = next_insn)
  {
next_insn = next_active_insn (insn);
!   if (INSN_CODE (insn) == CODE_FOR_iprefetch
! || INSN_CODE (insn) == CODE_FOR_hbr)
{
  if (hbr_insn)
{
--- 2029,2037 
for (; insn; insn = next_insn)
  {
next_insn = next_active_insn (insn);
!   if (INSN_P (insn)
!   && (INSN_CODE (insn) == CODE_FOR_iprefetch
! || INSN_CODE (insn) == CODE_FOR_hbr))
{
  if (hbr_insn)
{
*** pad_bb(void)
*** 2048,2054 
}
  hbr_insn = insn;
}
!   if (INSN_CODE (insn) == CODE_FOR_blockage && next_insn)
{
  if (GET_MODE (insn) == TImode)
PUT_MODE (next_insn, TImode);
--- 2049,2055 
}
  hbr_insn = insn;
}
!   if (INSN_P (insn) && INSN_CODE (insn) == CODE_FOR_blockage && next_insn)
{
  if (GET_MODE (insn) == TImode)
PUT_MODE (next_insn, TImode);
-- 
  Dr. Ulrich Weigand
  GNU/Linux compilers and toolchain
  ulrich.weig...@de.ibm.com



[PATCH GCC]More conservative interchanging small loops with const initialized simple reduction

2017-12-08 Thread Bin Cheng
Hi,
This simple patch makes interchange even more conservative for small loops with 
constant initialized simple reduction.
The reason is undoing such reduction introduces new data reference and 
cond_expr, which could cost too much in a small
loop.
Test gcc.target/aarch64/pr62178.c is fixed with this patch.  Is it OK if test 
passes?

Thanks,
bin
2017-12-08  Bin Cheng  

* gimple-loop-interchange.cc (struct loop_cand): New field.
(loop_cand::loop_cand): Init new field in constructor.
(loop_cand::classify_simple_reduction): Record simple reduction
initialized with constant value.
(should_interchange_loops): New parameter.  Skip interchange if loop
has few data references and constant intitialized simple reduction.
(tree_loop_interchange::interchange): Update call to above function.
(should_interchange_loop_nest): Ditto.diff --git a/gcc/gimple-loop-interchange.cc b/gcc/gimple-loop-interchange.cc
index 6554a42..f45f7dc 100644
--- a/gcc/gimple-loop-interchange.cc
+++ b/gcc/gimple-loop-interchange.cc
@@ -199,13 +199,16 @@ struct loop_cand
   edge m_exit;
   /* Basic blocks of this loop.  */
   basic_block *m_bbs;
+  /* Number of constant initialized simple reduction.  */
+  unsigned m_num_const_init_simple_reduc;
 };
 
 /* Constructor.  */
 
 loop_cand::loop_cand (struct loop *loop, struct loop *outer)
   : m_loop (loop), m_outer (outer),
-m_exit (single_exit (loop)), m_bbs (get_loop_body (loop))
+m_exit (single_exit (loop)), m_bbs (get_loop_body (loop)),
+m_num_const_init_simple_reduc (0)
 {
 m_inductions.create (3);
 m_reductions.create (3);
@@ -440,7 +443,9 @@ loop_cand::classify_simple_reduction (reduction_p re)
 
   re->init_ref = gimple_assign_rhs1 (producer);
 }
-  else if (!CONSTANT_CLASS_P (re->init))
+  else if (CONSTANT_CLASS_P (re->init))
+m_num_const_init_simple_reduc++;
+  else
 return;
 
   /* Check how reduction variable is used.  */
@@ -1422,6 +1427,7 @@ dump_access_strides (vec datarefs)
 static bool
 should_interchange_loops (unsigned i_idx, unsigned o_idx,
  vec datarefs,
+ unsigned num_const_init_simple_reduc,
  bool innermost_loops_p, bool dump_info_p = true)
 {
   unsigned HOST_WIDE_INT ratio;
@@ -1522,6 +1528,12 @@ should_interchange_loops (unsigned i_idx, unsigned o_idx,
   if (num_unresolved_drs != 0 || num_resolved_not_ok_drs != 0)
 return false;
 
+  /* Conservatively skip interchange in cases only have few data references
+ and constant initialized simple reduction since it introduces new data
+ reference as well as ?: operation.  */
+  if (num_old_inv_drs + num_const_init_simple_reduc * 2 >= datarefs.length ())
+return false;
+
   /* We use different stride comparison ratio for interchanging innermost
  two loops or not.  The idea is to be conservative in interchange for
  the innermost loops.  */
@@ -1576,6 +1588,7 @@ tree_loop_interchange::interchange (vec 
datarefs,
 
   /* Check profitability for loop interchange.  */
   if (should_interchange_loops (i_idx, o_idx, datarefs,
+   iloop.m_num_const_init_simple_reduc,
iloop.m_loop->inner == NULL))
{
  if (dump_file && (dump_flags & TDF_DETAILS))
@@ -1764,7 +1779,7 @@ should_interchange_loop_nest (struct loop *loop_nest, 
struct loop *innermost,
   /* Check if any two adjacent loops should be interchanged.  */
   for (struct loop *loop = innermost;
loop != loop_nest; loop = loop_outer (loop), idx--)
-if (should_interchange_loops (idx, idx - 1, datarefs,
+if (should_interchange_loops (idx, idx - 1, datarefs, 0,
  loop == innermost, false))
   return true;
 


[PATCH][i386] Fix PR83008

2017-12-08 Thread Richard Biener

This restores the vec_construct cost dependence on the vector element
count.  Honza removed this (accidentially?) during the rework.

Bootstrap / regtest running on x86_64-unknown-linux-gnu, ok for trunk?

Thanks,
Richard.

2017-12-08  Richard Biener  

PR target/83008
* config/i386/i386.c (ix86_builtin_vectorization_cost): Restore
vec_construct dependence on vector element count.

Index: gcc/config/i386/i386.c
===
--- gcc/config/i386/i386.c  (revision 255499)
+++ gcc/config/i386/i386.c  (working copy)
@@ -44879,7 +44879,8 @@ ix86_builtin_vectorization_cost (enum ve
  ix86_cost->sse_op, true);
 
   case vec_construct:
-   return ix86_vec_cost (mode, ix86_cost->sse_op, false);
+   return (ix86_vec_cost (mode, ix86_cost->sse_op, false)
+   * (TYPE_VECTOR_SUBPARTS (vectype) - 1));
 
   default:
 gcc_unreachable ();


Re: [PATCH GCC]More conservative interchanging small loops with const initialized simple reduction

2017-12-08 Thread Richard Biener
On Fri, Dec 8, 2017 at 12:46 PM, Bin Cheng  wrote:
> Hi,
> This simple patch makes interchange even more conservative for small loops 
> with constant initialized simple reduction.
> The reason is undoing such reduction introduces new data reference and 
> cond_expr, which could cost too much in a small
> loop.
> Test gcc.target/aarch64/pr62178.c is fixed with this patch.  Is it OK if test 
> passes?

Shouldn't we do this even for non-constant initialzied simple
reduction?  Because for any simple
reduction we add two DRs that are not innermost, for constant
initialized we add an additional
cond-expr.  So ...

+  /* Conservatively skip interchange in cases only have few data references
+ and constant initialized simple reduction since it introduces new data
+ reference as well as ?: operation.  */
+  if (num_old_inv_drs + num_const_init_simple_reduc * 2 >= datarefs.length ())
+return false;
+

can you, instead of carrying num_const_init_simple_reduc simply loop
over m_reductions
and classify them in this function accordingly?  I think we want to
cost non-constant-init
reductions as well.  The :? can eventually count for another DR for
cost purposes.

It looks like we do count the existing DRs for the reduction?  Is that
why you arrive
at the num_const_init_simple_reduc * 2 figure? (one extra load plus one ?:)
But we don't really know whether the DR was invariant in the outer
loop (well, I suppose
we could remember the DR in m_reductions).

Note that the good thing is that the ?: has an invariant condition and
thus vectorization
can hoist the mask generation out of the vectorized loop which means
it boils down to
cheap operations.  My gut feeling is that just looking at the number
of memory references
isn't a good indicator of profitability as the regular stmt workload
has a big impact on
profitability of vectorization.

So no ack nor nack...

Richard.

> Thanks,
> bin
> 2017-12-08  Bin Cheng  
>
> * gimple-loop-interchange.cc (struct loop_cand): New field.
> (loop_cand::loop_cand): Init new field in constructor.
> (loop_cand::classify_simple_reduction): Record simple reduction
> initialized with constant value.
> (should_interchange_loops): New parameter.  Skip interchange if loop
> has few data references and constant intitialized simple reduction.
> (tree_loop_interchange::interchange): Update call to above function.
> (should_interchange_loop_nest): Ditto.


Re: [fortran] Add support for #pragma GCC unroll v3

2017-12-08 Thread Janne Blomqvist
On Wed, Dec 6, 2017 at 11:21 AM, Eric Botcazou  wrote:
>> 6.1 Extensions implemented in GNU Fortran
>> 7.2 GNU Fortran Compiler Directives
>>
>> 6.1 describes extension covering legacy code and vendor extensions.
>> 7.2 describes other !$GCC directives.  Currently, the section is
>> mainly calling conventions (CDECL, STDCALL, etc) and library
>> macroc (DLLEXPORT).  These should probably be in 7.2.1 and the
>> UNROLL directive in 7.2.2.
>
> Attached is a minimal patch along these lines.

Ok, thanks for the patch.


-- 
Janne Blomqvist


Re: [RFC] Add means to split dump file into several files -- Use in lra

2017-12-08 Thread Richard Biener
On Thu, 7 Dec 2017, Tom de Vries wrote:

> Hi,
> 
> I'm currently debugging a problem in lra, and got a bit lost in the 20k+ lines
> dump file.
> 
> I observed that:
> - the lra dump file is one of the biggest ones
> - lra itself consists of a looping of sub-passes.
> 
> So, I've:
> - written a dump infrastructure addition that can be used within a pass
>   to mark the end of the current (sub)dump file and start a next
>   subdump file.
> - used that infrastructure to instrument lra to dump info from
>   different subpasses into separate files.
> 
> Using this patch I managed to split the reload dump file into smaller bits:
> ...
> $ wc -l *.reload.*
>3 no-scevccp-outer-10.c.276r.reload
>0 no-scevccp-outer-10.c.276r.reload.001.lra_start
>3 no-scevccp-outer-10.c.276r.reload.002.remove_scratches
> 2335 no-scevccp-outer-10.c.276r.reload.003.lra_constraints
> 1781 no-scevccp-outer-10.c.276r.reload.004.lra_create_live_ranges
>  460 no-scevccp-outer-10.c.276r.reload.005.lra_inheritance
>  920 no-scevccp-outer-10.c.276r.reload.006.lra_create_live_ranges
>  563 no-scevccp-outer-10.c.276r.reload.007.lra_assign
>  184 no-scevccp-outer-10.c.276r.reload.008.lra_undo_inheritance
>  830 no-scevccp-outer-10.c.276r.reload.009.lra_create_live_ranges
>3 no-scevccp-outer-10.c.276r.reload.010.lra_coalesce
>  165 no-scevccp-outer-10.c.276r.reload.011.lra_constraints
>  844 no-scevccp-outer-10.c.276r.reload.012.lra_create_live_ranges
>  110 no-scevccp-outer-10.c.276r.reload.013.lra_inheritance
>  879 no-scevccp-outer-10.c.276r.reload.014.lra_create_live_ranges
>   22 no-scevccp-outer-10.c.276r.reload.015.lra_assign
>   74 no-scevccp-outer-10.c.276r.reload.016.lra_undo_inheritance
>   19 no-scevccp-outer-10.c.276r.reload.017.lra_constraints
>  845 no-scevccp-outer-10.c.276r.reload.018.lra_create_live_ranges
>   80 no-scevccp-outer-10.c.276r.reload.019.lra_remat
>   27 no-scevccp-outer-10.c.276r.reload.020.lra_spill
>  866 no-scevccp-outer-10.c.276r.reload.021.lra_constraints
>  830 no-scevccp-outer-10.c.276r.reload.022.lra_create_live_ranges
>0 no-scevccp-outer-10.c.276r.reload.023.lra_inheritance
>  830 no-scevccp-outer-10.c.276r.reload.024.lra_create_live_ranges
>   53 no-scevccp-outer-10.c.276r.reload.025.lra_assign
>5 no-scevccp-outer-10.c.276r.reload.026.lra_constraints
>  370 no-scevccp-outer-10.c.276r.reload.027.lra_finishing
> 4137 no-scevccp-outer-10.c.276r.reload.028.lra_end
>0 no-scevccp-outer-10.c.276r.reload.029.lra_start
>   27 no-scevccp-outer-10.c.276r.reload.030.remove_scratches
>  553 no-scevccp-outer-10.c.276r.reload.031.lra_constraints
>  188 no-scevccp-outer-10.c.276r.reload.032.lra_create_live_ranges
>8 no-scevccp-outer-10.c.276r.reload.033.lra_inheritance
>  188 no-scevccp-outer-10.c.276r.reload.034.lra_create_live_ranges
>   21 no-scevccp-outer-10.c.276r.reload.035.lra_assign
>3 no-scevccp-outer-10.c.276r.reload.036.lra_undo_inheritance
>5 no-scevccp-outer-10.c.276r.reload.037.lra_constraints
>   99 no-scevccp-outer-10.c.276r.reload.038.lra_finishing
>  515 no-scevccp-outer-10.c.276r.reload.039.lra_end
> ...
> 
> Notes:
> - dump info from different functions is not put together
> - this is on by default atm. We probably want to enable this only
>   using a switch fsplit-dump or some such.
> - the lra_end dump files ends with ";; Function ...", which should be in
>   the next lra_start dump file. Once we enable this using a switch we
>   can probably do better.
> 
> Any comments?

Sometimes it's harder to walk through multiple files IMHO.  Merging
ira and lra dumps would be nice.

;)

And you can open a file multiple times so what's exactly the benefit
of having multiple files?

Richard.


Re: [PATCH] Add testcase for PR83252 (was Re: patch to fix PR80818)

2017-12-08 Thread Rainer Orth
Hi Jakub,

> On Wed, Nov 29, 2017 at 05:21:22PM -0500, Vladimir Makarov wrote:
>>   The following patch fixes
>> 
>> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=80818
>> 
>>   The patch was successfully tested and bootstrapped on x86_64. The patch
>> has no test because it is hard to check the problem.  I checked manually
>
> This changed fixed PR83252 which has a reasonably small testcase.
> I've further reduced it using creduce (with -O0 -W{,maybe-}uninitialized
> and/or -fsanitize=undefined checking, plus test that it succeeds with
> r255258 and fails with r255257).  Can you please double check if the
> testcase represents the same issue you were working on or if your change
> merely made the bug latent again?
>
> 2017-12-04  Jakub Jelinek  
>
>   PR target/83252
>   * gcc.target/i386/i386.exp (check_effective_target_bmi2): Moved to ...
>   * lib/target-supports.exp (check_effective_target_bmi2): ... here.  
> Guard with
>   i?86-*-* x86_64-*-*.
>   * g++.dg/opt/pr83252.C: New test.

the new testcase FAILs on Solaris/x86 with /bin/as:

+FAIL: g++.dg/opt/pr83252.C  -std=gnu++11 execution test
+FAIL: g++.dg/opt/pr83252.C  -std=gnu++14 execution test
+FAIL: g++.dg/opt/pr83252.C  -std=gnu++98 execution test

ld.so.1: pr83252.exe: fatal: pr83252.exe: hardware capability (CA_SUNW_HW_2) 
unsupported: 0x80  [ BMI2 ]

Inside gcc.target/i386, clearcap.exp takes care of that.

Rainer

-- 
-
Rainer Orth, Center for Biotechnology, Bielefeld University


Re: [PATCH GCC]More conservative interchanging small loops with const initialized simple reduction

2017-12-08 Thread Bin.Cheng
On Fri, Dec 8, 2017 at 12:17 PM, Richard Biener
 wrote:
> On Fri, Dec 8, 2017 at 12:46 PM, Bin Cheng  wrote:
>> Hi,
>> This simple patch makes interchange even more conservative for small loops 
>> with constant initialized simple reduction.
>> The reason is undoing such reduction introduces new data reference and 
>> cond_expr, which could cost too much in a small
>> loop.
>> Test gcc.target/aarch64/pr62178.c is fixed with this patch.  Is it OK if 
>> test passes?
>
> Shouldn't we do this even for non-constant initialzied simple
> reduction?  Because for any simple
> reduction we add two DRs that are not innermost, for constant
> initialized we add an additional
> cond-expr.  So ...
>
> +  /* Conservatively skip interchange in cases only have few data references
> + and constant initialized simple reduction since it introduces new data
> + reference as well as ?: operation.  */
> +  if (num_old_inv_drs + num_const_init_simple_reduc * 2 >= datarefs.length 
> ())
> +return false;
> +
>
> can you, instead of carrying num_const_init_simple_reduc simply loop
> over m_reductions
> and classify them in this function accordingly?  I think we want to
> cost non-constant-init
> reductions as well.  The :? can eventually count for another DR for
> cost purposes.
Number of non-constant-init reductions can still be carried in struct
loop_cand?  I am not very sure what's the advantage of an additional
loop over m_reductions getting the same information.
Perhaps the increase of stmts should be counted like:
  num_old_inv_drs + num_const_init_simple_reduc * 2 - num_new_inv_drs
Question is which number should this be compared against.  (we may
need to shift num_new_inv_drs to the other side for wrapping issue).

>
> It looks like we do count the existing DRs for the reduction?  Is that
> why you arrive
> at the num_const_init_simple_reduc * 2 figure? (one extra load plus one ?:)
Yes.
> But we don't really know whether the DR was invariant in the outer
> loop (well, I suppose
Hmm, I might misunderstand here.  num_old_inv_drs tracks the number of
invariant reference with regarding to inner loop, rather than the
outer loop.  The same to num_new_inv_drs,
which means a reference become invariant after loop interchange with
regarding to (the new) inner loop.  This invariant information is
always known from data reference, right?
As for DRs for reduction, we know it's invariant because we set its
inner loop stride to zero.

> we could remember the DR in m_reductions).
>
> Note that the good thing is that the ?: has an invariant condition and
> thus vectorization
> can hoist the mask generation out of the vectorized loop which means
> it boils down to
> cheap operations.  My gut feeling is that just looking at the number
> of memory references
> isn't a good indicator of profitability as the regular stmt workload
> has a big impact on
> profitability of vectorization.
It's not specific to vectorization.  The generated new code also costs
too much in small loops without vectorization.  But yes, # of mem_refs
may be too inaccurate, maybe we should check against num_stmts.

Thanks,
bin
>
> So no ack nor nack...
>
> Richard.
>
>> Thanks,
>> bin
>> 2017-12-08  Bin Cheng  
>>
>> * gimple-loop-interchange.cc (struct loop_cand): New field.
>> (loop_cand::loop_cand): Init new field in constructor.
>> (loop_cand::classify_simple_reduction): Record simple reduction
>> initialized with constant value.
>> (should_interchange_loops): New parameter.  Skip interchange if loop
>> has few data references and constant intitialized simple reduction.
>> (tree_loop_interchange::interchange): Update call to above function.
>> (should_interchange_loop_nest): Ditto.


Re: [PATCH] Add testcase for PR83252 (was Re: patch to fix PR80818)

2017-12-08 Thread Jakub Jelinek
On Fri, Dec 08, 2017 at 01:38:36PM +0100, Rainer Orth wrote:
> Hi Jakub,
> 
> > On Wed, Nov 29, 2017 at 05:21:22PM -0500, Vladimir Makarov wrote:
> >>   The following patch fixes
> >> 
> >> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=80818
> >> 
> >>   The patch was successfully tested and bootstrapped on x86_64. The patch
> >> has no test because it is hard to check the problem.  I checked manually
> >
> > This changed fixed PR83252 which has a reasonably small testcase.
> > I've further reduced it using creduce (with -O0 -W{,maybe-}uninitialized
> > and/or -fsanitize=undefined checking, plus test that it succeeds with
> > r255258 and fails with r255257).  Can you please double check if the
> > testcase represents the same issue you were working on or if your change
> > merely made the bug latent again?
> >
> > 2017-12-04  Jakub Jelinek  
> >
> > PR target/83252
> > * gcc.target/i386/i386.exp (check_effective_target_bmi2): Moved to ...
> > * lib/target-supports.exp (check_effective_target_bmi2): ... here.  
> > Guard with
> > i?86-*-* x86_64-*-*.
> > * g++.dg/opt/pr83252.C: New test.
> 
> the new testcase FAILs on Solaris/x86 with /bin/as:
> 
> +FAIL: g++.dg/opt/pr83252.C  -std=gnu++11 execution test
> +FAIL: g++.dg/opt/pr83252.C  -std=gnu++14 execution test
> +FAIL: g++.dg/opt/pr83252.C  -std=gnu++98 execution test
> 
> ld.so.1: pr83252.exe: fatal: pr83252.exe: hardware capability (CA_SUNW_HW_2) 
> unsupported: 0x80  [ BMI2 ]
> 
> Inside gcc.target/i386, clearcap.exp takes care of that.

This can't be in gcc.target/i386/, because the test has to be C++ (doesn't
fail in C).

So dg-skip-if on Solaris, or { target { bmi2 && { ! *-*-solaris* } } } for
the -mbmi2 option?

Jakub


Re: [PATCH] Add testcase for PR83252 (was Re: patch to fix PR80818)

2017-12-08 Thread Rainer Orth
Hi Jakub,

>> the new testcase FAILs on Solaris/x86 with /bin/as:
>> 
>> +FAIL: g++.dg/opt/pr83252.C  -std=gnu++11 execution test
>> +FAIL: g++.dg/opt/pr83252.C  -std=gnu++14 execution test
>> +FAIL: g++.dg/opt/pr83252.C  -std=gnu++98 execution test
>> 
>> ld.so.1: pr83252.exe: fatal: pr83252.exe: hardware capability
>> (CA_SUNW_HW_2) unsupported: 0x80 [ BMI2 ]
>> 
>> Inside gcc.target/i386, clearcap.exp takes care of that.
>
> This can't be in gcc.target/i386/, because the test has to be C++ (doesn't
> fail in C).

I see; hadn't checked...

> So dg-skip-if on Solaris, or { target { bmi2 && { ! *-*-solaris* } } } for
> the -mbmi2 option?

... or { dg-additional-options "-mclear-hwcap" { *-*-solaris* } }

Rainer

-- 
-
Rainer Orth, Center for Biotechnology, Bielefeld University


Re: [PATCH] Add testcase for PR83252 (was Re: patch to fix PR80818)

2017-12-08 Thread Jakub Jelinek
On Fri, Dec 08, 2017 at 01:47:35PM +0100, Rainer Orth wrote:
> >> the new testcase FAILs on Solaris/x86 with /bin/as:
> >> 
> >> +FAIL: g++.dg/opt/pr83252.C  -std=gnu++11 execution test
> >> +FAIL: g++.dg/opt/pr83252.C  -std=gnu++14 execution test
> >> +FAIL: g++.dg/opt/pr83252.C  -std=gnu++98 execution test
> >> 
> >> ld.so.1: pr83252.exe: fatal: pr83252.exe: hardware capability
> >> (CA_SUNW_HW_2) unsupported: 0x80 [ BMI2 ]
> >> 
> >> Inside gcc.target/i386, clearcap.exp takes care of that.
> >
> > This can't be in gcc.target/i386/, because the test has to be C++ (doesn't
> > fail in C).
> 
> I see; hadn't checked...
> 
> > So dg-skip-if on Solaris, or { target { bmi2 && { ! *-*-solaris* } } } for
> > the -mbmi2 option?
> 
> ... or { dg-additional-options "-mclear-hwcap" { *-*-solaris* } }

If it works, sure.  I can't test it though.

Jakub


Re: [PATCH, rs6000] Add missing builtin functionality and tests

2017-12-08 Thread Segher Boessenkool
Hi Carl,

On Thu, Dec 07, 2017 at 08:56:07AM -0800, Carl Love wrote:
>   * config/rs6000/altivec.h (vec_extract_fp32_from_short[h|l]): Add 
> #defines.

Line too long.

>   * gcc.target/powerpc/builtins-3-p8.c (test_vsi_packs_[vui|vsi
>   vssi|vusi], test_vsi_packsu-[vssi|vusi|vsll|vull|vsi|vui]): Add
>   testcases. Add dg-final tests for new instructions.
>   * gcc.target/powerpc/p8vector-builtin-2.c (v[b|bs|u]char_eq,
>   v[b|s|i|u]int_eq, vbool_eq, v[b|s|u]int_ne, vbool_ne,
>   vsign_ne, vuns_ne, vbshort_ne): Add tests. Add dg-final instruction
>   tests.

>   * gcc.target/powerpc/builtins-3.c 
> (test_sll_v[sc|uc|si]_v[sc|uc|si]_v[s|uc]):

If you don't write out the names people cannot find them easily.

> +/* Expected results for Big Endian:
> + vec_packpx vpkpx
> + vec_vmulosbvmulesb

> +/* { dg-final { scan-assembler-times "vmulesb" 0 } } */

Those don't agree.  Also test for vmulosb?  (Also in the LE test).

Did you test this targeting older CPUs?  If not, expect fallout.

The patch is okay for trunk (with the trivial fixes).  Thanks!


Segher


Re: [PATCH, rs6000] Add additional builtin tests

2017-12-08 Thread Segher Boessenkool
Hi!

> +// *** TODO:  add checs for the adds tests

Did you forget this part?

Also the same trivial things as the previous patch.

The rest looks fine, okay for trunk.  Thanks!


Segher


PING [PATCH] i386: Avoid PLT when shadow stack is enabled directly

2017-12-08 Thread H.J. Lu
On Tue, Oct 24, 2017 at 5:31 AM, H.J. Lu  wrote:
> PLT should be avoided with shadow stack in 32-bit mode if more than 2
> parameters are passed in registers since only 2 parameters can be passed
> in registers for external function calls via PLT with shadow stack
> enabled.
>
> OK for trunk if there is no regressions?

Here is the updated patch.

PING.

-- 
H.J.
From 7a2bea21f2854f1a7ad44c90fbb687f70c039144 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" 
Date: Tue, 24 Oct 2017 05:24:58 -0700
Subject: [PATCH] i386: Avoid PLT when shadow stack is enabled

PLT should be avoided with shadow stack in 32-bit mode if more than 2
parameters are passed in registers since only 2 parameters can be passed
in registers for external function calls via PLT with shadow stack
enabled.

gcc/

	PR target/81780
	* config/i386/i386.c (ix86_noplt_attribute_p): New function.
	(ix86_expand_call): Use it.
	(ix86_nopic_noplt_attribute_p): Likewise.

gcc/testsuite/

	PR target/81780
	* gcc.target/i386/pr81780-1.c: New test.
	* gcc.target/i386/pr81780-2.c: Likewise.
	* gcc.target/i386/pr81780-3.c: Likewise.
	* gcc.target/i386/pr81780-4.c: Likewise.
---
 gcc/config/i386/i386.c| 38 +--
 gcc/testsuite/gcc.target/i386/pr81780-1.c | 14 
 gcc/testsuite/gcc.target/i386/pr81780-2.c | 14 
 gcc/testsuite/gcc.target/i386/pr81780-3.c | 14 
 gcc/testsuite/gcc.target/i386/pr81780-4.c | 14 
 5 files changed, 82 insertions(+), 12 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr81780-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr81780-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr81780-3.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr81780-4.c

diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index e323102cef5..a7df509a43e 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -102,6 +102,7 @@ static bool ix86_save_reg (unsigned int, bool, bool);
 static bool ix86_function_naked (const_tree);
 static bool ix86_notrack_prefixed_insn_p (rtx);
 static void ix86_emit_restore_reg_using_pop (rtx);
+static bool ix86_noplt_attribute_p (rtx);
 
 
 #ifndef CHECK_STACK_LIMIT
@@ -27880,10 +27881,7 @@ ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
 	  && GET_CODE (addr) == SYMBOL_REF
 	  && !SYMBOL_REF_LOCAL_P (addr))
 	{
-	  if (flag_plt
-	  && (SYMBOL_REF_DECL (addr) == NULL_TREE
-		  || !lookup_attribute ("noplt",
-	DECL_ATTRIBUTES (SYMBOL_REF_DECL (addr)
+	  if (!ix86_noplt_attribute_p (addr))
 	{
 	  if (!TARGET_64BIT
 		  || (ix86_cmodel == CM_LARGE_PIC
@@ -28060,6 +28058,29 @@ ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
   return call;
 }
 
+/* Return true if the function being called was marked with attribute
+   "noplt" or using -fno-plt.  PLT should be avoided with shadow stack
+   if more than 2 parameters are passed in registers since only 2
+   parameters can be passed in registers for external function calls
+   via PLT with shadow stack enabled.  */
+
+static bool
+ix86_noplt_attribute_p (rtx call_op)
+{
+  if (!flag_plt)
+return true;
+
+  tree decl = SYMBOL_REF_DECL (call_op);
+  if (!decl)
+return false;
+
+  return (lookup_attribute ("noplt", DECL_ATTRIBUTES (decl))
+	  || (!TARGET_64BIT
+	  && (flag_cf_protection & CF_RETURN)
+	  && TARGET_SHSTK
+	  && ix86_function_regparm (TREE_TYPE (decl), decl) > 2));
+}
+
 /* Return true if the function being called was marked with attribute
"noplt" or using -fno-plt and we are compiling for non-PIC.  We need
to handle the non-PIC case in the backend because there is no easy
@@ -28076,14 +28097,7 @@ ix86_nopic_noplt_attribute_p (rtx call_op)
   || SYMBOL_REF_LOCAL_P (call_op))
 return false;
 
-  tree symbol_decl = SYMBOL_REF_DECL (call_op);
-
-  if (!flag_plt
-  || (symbol_decl != NULL_TREE
-  && lookup_attribute ("noplt", DECL_ATTRIBUTES (symbol_decl
-return true;
-
-  return false;
+  return ix86_noplt_attribute_p (call_op);
 }
 
 /* Output the assembly for a call instruction.  */
diff --git a/gcc/testsuite/gcc.target/i386/pr81780-1.c b/gcc/testsuite/gcc.target/i386/pr81780-1.c
new file mode 100644
index 000..f5a28020782
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr81780-1.c
@@ -0,0 +1,14 @@
+/* { dg-do compile { target { *-*-linux* && ia32 } } } */
+/* { dg-require-effective-target got32x_reloc } */
+/* { dg-options "-fcf-protection -mcet -O2 -fno-pic -fplt" } */
+
+extern void bar (int) __attribute__ ((regparm (3)));
+
+int
+foo (int i)
+{
+  bar (i);
+  return 0;
+}
+
+/* { dg-final { scan-assembler "call\[ \t\]*.bar@GOT" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr81780-2.c b/gcc/testsuite/gcc.target/i386/pr81780-2.c
new file mode 100644
index 000..59f4c5f4d70
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr81780-2.c
@@ -0,0 +1,14 @@
+/* { dg-do compile { target { *-*-linux* && ia32 } } } */
+/* { dg-require-eff

Re: [PATCH] Add testcase for PR83252 (was Re: patch to fix PR80818)

2017-12-08 Thread Rainer Orth
Hi Jakub,

> On Fri, Dec 08, 2017 at 01:47:35PM +0100, Rainer Orth wrote:
>> >> the new testcase FAILs on Solaris/x86 with /bin/as:
>> >> 
>> >> +FAIL: g++.dg/opt/pr83252.C  -std=gnu++11 execution test
>> >> +FAIL: g++.dg/opt/pr83252.C  -std=gnu++14 execution test
>> >> +FAIL: g++.dg/opt/pr83252.C  -std=gnu++98 execution test
>> >> 
>> >> ld.so.1: pr83252.exe: fatal: pr83252.exe: hardware capability
>> >> (CA_SUNW_HW_2) unsupported: 0x80 [ BMI2 ]
>> >> 
>> >> Inside gcc.target/i386, clearcap.exp takes care of that.
>> >
>> > This can't be in gcc.target/i386/, because the test has to be C++ (doesn't
>> > fail in C).
>> 
>> I see; hadn't checked...
>> 
>> > So dg-skip-if on Solaris, or { target { bmi2 && { ! *-*-solaris* } } } for
>> > the -mbmi2 option?
>> 
>> ... or { dg-additional-options "-mclear-hwcap" { *-*-solaris* } }
>
> If it works, sure.  I can't test it though.

it does if you add a "target" in the right place ;-)  Tested on
i386-pc-solaris2.11, sparc-sun-solaris2.11, and x86_64-pc-linux-gnu,
installed on mainline.

Rainer

-- 
-
Rainer Orth, Center for Biotechnology, Bielefeld University


2017-12-08  Rainer Orth  

* g++.dg/opt/pr83252.C: Add -mclear-hwcap on *-*-solaris*.

# HG changeset patch
# Parent  a1d6e5b19ac25366630bf95a1d1d9b333b0b47ca
Disable hwcap on Solaris in g++.dg/opt/pr83252.C

diff --git a/gcc/testsuite/g++.dg/opt/pr83252.C b/gcc/testsuite/g++.dg/opt/pr83252.C
--- a/gcc/testsuite/g++.dg/opt/pr83252.C
+++ b/gcc/testsuite/g++.dg/opt/pr83252.C
@@ -2,6 +2,7 @@
 // { dg-do run }
 // { dg-options "-O3" }
 // { dg-additional-options "-mbmi2 -mtune=intel" { target bmi2 } }
+// { dg-additional-options "-mclear-hwcap" { target *-*-solaris* } }
 
 #if __SIZEOF_INT__ == 4 && __SIZEOF_LONG_LONG__ == 8 && __CHAR_BIT__ == 8
 


[PATCH] rl78 umindi3 improvement

2017-12-08 Thread Sebastian Perta
Hello,

The following patch improves both the speed and code size for 64 bit
unsigned min for RL78:
it emits a library function call instead of emitting code for  the 64 bit
min for every single time.
The unsigned min function which was added in libgcc is hand written, so more
optimal than what GCC generates.

The change can easily be seen on the following test case:
unsigned long long my_smaxdi3(unsigned long long x, unsigned long long y){ 
return (x < y)? x : y;
}
I did not add this to the regression as it very simple and there are test
cases in the regression which test this, for example
gcc.c-torture/execute/pr49039.c and gcc.dg/torture/pr25718-1.c.
Regression test is OK, tested with the following command:
make -k check-gcc RUNTESTFLAGS=--target_board=rl78-sim

Please let me know if this is OK, Thank you!
Sebastian

Index: gcc/ChangeLog
===
--- gcc/ChangeLog   (revision 255471)
+++ gcc/ChangeLog   (working copy)
@@ -1,3 +1,7 @@
+2017-12-07  Sebastian Perta  
+
+   * config/rl78/rl78.md: New define_expand "umindi3".
+   
 2017-12-07  Vladimir Makarov  
 
PR target/83252
Index: gcc/config/rl78/rl78.md
===
--- gcc/config/rl78/rl78.md (revision 255471)
+++ gcc/config/rl78/rl78.md (working copy)
@@ -234,6 +234,16 @@
DONE;"
 )
 
+(define_expand "umindi3"
+ [(set (match_operand:DI  0 "nonimmediate_operand" "")
+   (umin:DI (match_operand:DI 1 "general_operand"  "")
+(match_operand:DI2 "general_operand"  "")))
+   ]
+  "optimize_size"
+  "rl78_emit_libcall (\"__umindi3\", UMIN, DImode, DImode, 3, operands);
+   DONE;"
+)
+
 (define_insn "addsi3_internal_virt"
   [(set (match_operand:SI  0 "nonimmediate_operand" "=v,&vm, vm")
(plus:SI (match_operand:SI 1 "general_operand"  "0, vim, vim")
Index: libgcc/ChangeLog
===
--- libgcc/ChangeLog(revision 255471)
+++ libgcc/ChangeLog(working copy)
@@ -1,3 +1,8 @@
+2017-12-07  Sebastian Perta  
+
+   * config/rl78/umindi3.S: New assembly file.
+   * config/rl78/t-rl78: Added umindi3.S to LIB2ADD.
+   
 2017-11-30  Michael Meissner  
 
* config/rs6000/_mulkc3.c (__mulkc3): Add forward declaration.
Index: libgcc/config/rl78/t-rl78
===
--- libgcc/config/rl78/t-rl78   (revision 255471)
+++ libgcc/config/rl78/t-rl78   (working copy)
@@ -32,7 +32,8 @@
$(srcdir)/config/rl78/fpmath-sf.S \
$(srcdir)/config/rl78/cmpsi2.S \
$(srcdir)/config/rl78/adddi3.S \
-   $(srcdir)/config/rl78/subdi3.S
+   $(srcdir)/config/rl78/subdi3.S \
+   $(srcdir)/config/rl78/umindi3.S
 
 LIB2FUNCS_EXCLUDE = _clzhi2 _clzsi2 _ctzhi2 _ctzsi2 \
   _popcounthi2 _popcountsi2 \
Index: libgcc/config/rl78/umindi3.S
===
--- libgcc/config/rl78/umindi3.S(nonexistent)
+++ libgcc/config/rl78/umindi3.S(working copy)
@@ -0,0 +1,74 @@
+;   Copyright (C) 2017 Free Software Foundation, Inc.
+;   Contributed by Sebastian Perta.
+; 
+; This file is free software; you can redistribute it and/or modify it
+; under the terms of the GNU General Public License as published by the
+; Free Software Foundation; either version 3, or (at your option) any
+; later version.
+; 
+; This file is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of
+; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+; General Public License for more details.
+; 
+; Under Section 7 of GPL version 3, you are granted additional
+; permissions described in the GCC Runtime Library Exception, version
+; 3.1, as published by the Free Software Foundation.
+;
+; You should have received a copy of the GNU General Public License and
+; a copy of the GCC Runtime Library Exception along with this program;
+; see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+; .
+
+
+#include "vregs.h"
+
+.text
+
+START_FUNC ___umindi3
+
+; copy first argument/operand to the output registers
+movw   ax, [sp+4]
+movw   r8, ax
+movw   ax, [sp+6]
+movw   r10, ax
+movw   ax, [sp+8]
+movw   r12, ax
+movw   ax, [sp+10]
+movw   r14, ax
+
+; use 16-bit compares from the most significant words downto the least
significant ones
+movw   ax, [sp+18]
+cmpw   ax, r14
+bc $.L1
+bnz$.L2
+
+movw   ax, [sp+16]
+cmpw   ax, r12
+bc $.L1
+bnz$.L2
+
+movw   ax, [sp+14]
+cmpw   ax, r10
+bc $.L1
+bnz$.L2
+
+movw   ax, [sp+12]
+cmpw   ax, r8
+bc $.L1
+ret
+
+.L1:
+; copy second argument/operand to the output registers
+movw   ax, [sp+12]
+movw   r8, ax
+

Re: [PATCH] Further improvements for the (T)(P+A)-(T)(P+B) folding (PR sanitizer/81281)

2017-12-08 Thread Jakub Jelinek
On Thu, Dec 07, 2017 at 09:28:32PM +0100, Marc Glisse wrote:
> On Thu, 7 Dec 2017, Jakub Jelinek wrote:
> 
> > When committing the previous PR81281 patch, I've removed all the @@0 cases
> > on plus:c, used @0 instead, to make sure we don't regress.
> > 
> > This patch readds those where possible.  For the cases where there is
> > just P and A, it was mostly a matter of @@0 and convert? instead of convert
> > plus using type from @1 instead of @0, though if @0 is INTEGER_CST, what we
> > usually end up with is a (plus (convert (plus @1 @0) @2) where @2 negated
> > is equal to @0, so the patch adds a simplification for that too.
> 
> There may be a bit of overlap with "(A +- CST1) +- CST2 -> A + CST3"
> elsewhere in the file. Do you think there is a convenient way to generalize
> it so it also covers this case, or does it look better to keep them
> separate? (I haven't had time to study your recent patches yet, so I don't

I don't think it is similar enough to what this pattern does, both solve
quite different problems, have quite different tests and pretty much nothing
shareable.

Jakub


PING [PATCH] i386: Insert ENDBR before the profiling counter call

2017-12-08 Thread H.J. Lu
On Tue, Oct 24, 2017 at 10:58 AM, H.J. Lu  wrote:
> On Tue, Oct 24, 2017 at 10:40 AM, Andi Kleen  wrote:
>> "H.J. Lu"  writes:
>>> --- /dev/null
>>> +++ b/gcc/testsuite/gcc.target/i386/pr82699-4.c
>>> @@ -0,0 +1,11 @@
>>> +/* { dg-do compile { target { *-*-linux* && { ! ia32 } } } } */
>>> +/* { dg-options "-O2 -fpic -fcf-protection -mcet -pg -mfentry 
>>> -fasynchronous-unwind-tables" } */
>>> +/* { dg-final { scan-assembler-times {\t\.cfi_startproc\n\tendbr} 1 }
>>> } */
>>
>> Would add test cases for -mnop-mcount and -mrecord-mcount too
>>
>
> Here is the updated patch to add a testcase for -mnop-mcount -mrecord-mcount.
> No other changes otherwise.
>

PING.

https://gcc.gnu.org/ml/gcc-patches/2017-10/msg01741.html

-- 
H.J.


Re: [PATCH] Fix up tree-ssa/strn{cat,cpy-2}.c (PR tree-optimization/83075)

2017-12-08 Thread Jakub Jelinek
On Thu, Dec 07, 2017 at 11:03:10AM -0700, Martin Sebor wrote:
> On 12/07/2017 09:55 AM, Jakub Jelinek wrote:
> > On Wed, Dec 06, 2017 at 05:30:53PM +0100, Jakub Jelinek wrote:
> > > On Wed, Dec 06, 2017 at 09:20:15AM -0700, Martin Sebor wrote:
> > > > Attached is a patch with the comment updated/simplified.
> > > > The tests do the job they need to do today so I just removed
> > > > the useless attribute but otherwise left them unchanged.  If
> > > > you would like to enhance them in some way please feel free.
> > > 
> > > Ok for trunk, with a minor nit.  I'll tweak the tests incrementally
> > > when it is in.
> > 
> > So here is the fix for those testcases.
> > 
> > They didn't test what they meant to test, because they didn't FAIL
> > without the patch.  That is because the bug was that the -W* option
> > affected code generation, so with -O2 -Wno-stringop-overflow it didn't
> > trigger it.
> 
> Doh!  I suppose that underscores that the right way to write test
> cases for optimization bugs is to prune warnings out of their output

No, warnings shouldn't be pruned unless really necessary.

It really depends on what you are testing and whether you want to include
-W* options, or -Wno-*, or none of them, or -w.  E.g. the last one is
default in gcc.c-torture/{compile,execute}.

If you need some warnings enabled, then better just add dg-warning
for the options, then you verify you get the right warnings and if it is
dg-do run also right execution, etc.

The reason why you can't have -Wno-foo here is that the test doesn't FAIL
without it without the fix.  That should be a standard procedure for every
submitted patch which adds a test testing a fix in the same patch.
With a vanilla compiler test that
make check RUNTESTFLAGS==the_test
FAILs and with the patch applied PASSes.  This takes just a couple of
seconds.  Only after this one should bootstrap/regtest it fully.

> I don't have an opinion on the rest of these changes.  I do want
> to make one comment about runtime tests.  I fairly regularly run
> tests with cross-compilers on the build machine.  This lets me
> verify that compile-only tests pass but it doesn't do anything
> for tests that need to run.  In fact, with the current mixture
> of all kinds of tests in the same directory, it pretty much rules
> out drawing any conclusions from test results in this setup.  So
> while I appreciate the additional testing done by the runtime
> tests, I think ideally, having compile time only tests would be
> the baseline requirement and runtime tests would be a separate
> layer that would provide additional validation when possible.

Depends on what is being tested, properly written valid (without UB) runtime
tests are extremely useful, they test more than compile time only tests can
check and even in the future often they will catch bugs in other parts of
the compiler.  One of the reason why I don't like unit testing too much...

And no, I don't think a compile time test should be added if we are already
adding a runtime test that covers it too and better.

Jakub


Re: [AArch64] Fix ICEs in aarch64_print_operand

2017-12-08 Thread Christophe Lyon
Hi Richard,


On 7 December 2017 at 10:31, James Greenhalgh  wrote:
> On Tue, Dec 05, 2017 at 05:57:37PM +, Richard Sandiford wrote:
>> Three related regression fixes:
>>
>> - We can't use asserts like:
>>
>> gcc_assert (GET_MODE_SIZE (mode) == 16);
>>
>>   in aarch64_print_operand because it could trigger for invalid user input.
>>
>> - The output_operand_lossage in aarch64_print_address_internal:
>>
>> output_operand_lossage ("invalid operand for '%%%c'", op);
>>
>>   wasn't right because "op" is an rtx_code enum rather than the
>>   prefix character.
>>
>> - aarch64_print_operand_address shouldn't call output_operand_lossage
>>   (because it doesn't have a prefix code) but instead fall back to
>>   output_addr_const.
>>
>> Tested on aarch64-linux-gnu.  OK to install?
>
> OK.
>
> Thanks,
> James
>
>>
>> Thanks,
>> Richard
>>
>>
>> 2017-12-05  Richard Sandiford  
>>
>> gcc/
>>   * config/aarch64/aarch64.c (aarch64_print_address_internal): Return
>>   a bool success value.  Don't call output_operand_lossage here.
>>   (aarch64_print_ldpstp_address): Return a bool success value.
>>   (aarch64_print_operand_address): Call output_addr_const if
>>   aarch64_print_address_internal fails.
>>   (aarch64_print_operand): Don't assert that the mode is 16 bytes for
>>   'y'; call output_operand_lossage instead.  Call output_operand_lossage
>>   if aarch64_print_ldpstp_address fails.
>>
>> gcc/testsuite/
>>   * gcc.target/aarch64/asm-2.c: New test.
>>   * gcc.target/aarch64/asm-3.c: Likewise.
>>

The new test gcc.target/aarch64/asm-2.c ICEs when compiled with -mabi=ilp32:

/gcc/testsuite/gcc.target/aarch64/asm-2.c: In function 'f':
/gcc/testsuite/gcc.target/aarch64/asm-2.c:10:1: internal compiler
error: in aarch64_print_address_internal, at
config/aarch64/aarch64.c:5636
0xf2afd3 aarch64_print_address_internal
/gcc/config/aarch64/aarch64.c:5636
0xf2affd aarch64_print_operand_address
/gcc/config/aarch64/aarch64.c:5733
0x7fdd43 output_address(machine_mode, rtx_def*)
/gcc/final.c:3913
0x801288 output_asm_insn(char const*, rtx_def**)
/gcc/final.c:3770
0x802437 final_scan_insn(rtx_insn*, _IO_FILE*, int, int, int*)
/gcc/final.c:2673
0x802a1a final(rtx_insn*, _IO_FILE*, int)
/gcc/final.c:2052
0x8035ab rest_of_handle_final
/gcc/final.c:4498
0x8035ab execute
/gcc/final.c:4572

Can you check?

Thanks,

Christophe

>> Index: gcc/config/aarch64/aarch64.c
>> ===
>> --- gcc/config/aarch64/aarch64.c  2017-12-05 14:24:52.477015238 +
>> +++ gcc/config/aarch64/aarch64.c  2017-12-05 17:54:56.466247227 +
>> @@ -150,7 +150,7 @@ static bool aarch64_builtin_support_vect
>>bool is_packed);
>>  static machine_mode
>>  aarch64_simd_container_mode (scalar_mode mode, unsigned width);
>> -static void aarch64_print_ldpstp_address (FILE *f, machine_mode mode, rtx 
>> x);
>> +static bool aarch64_print_ldpstp_address (FILE *, machine_mode, rtx);
>>
>>  /* Major revision number of the ARM Architecture implemented by the target. 
>>  */
>>  unsigned aarch64_architecture_version;
>> @@ -5600,22 +5600,21 @@ #define buf_size 20
>>{
>>   machine_mode mode = GET_MODE (x);
>>
>> - if (GET_CODE (x) != MEM)
>> + if (GET_CODE (x) != MEM
>> + || (code == 'y' && GET_MODE_SIZE (mode) != 16))
>> {
>>   output_operand_lossage ("invalid operand for '%%%c'", code);
>>   return;
>> }
>>
>>   if (code == 'y')
>> -   {
>> - /* LDP/STP which uses a single double-width memory operand.
>> -Adjust the mode to appear like a typical LDP/STP.
>> -Currently this is supported for 16-byte accesses only.  */
>> - gcc_assert (GET_MODE_SIZE (mode) == 16);
>> - mode = DFmode;
>> -   }
>> +   /* LDP/STP which uses a single double-width memory operand.
>> +  Adjust the mode to appear like a typical LDP/STP.
>> +  Currently this is supported for 16-byte accesses only.  */
>> +   mode = DFmode;
>>
>> - aarch64_print_ldpstp_address (f, mode, XEXP (x, 0));
>> + if (!aarch64_print_ldpstp_address (f, mode, XEXP (x, 0)))
>> +   output_operand_lossage ("invalid operand prefix '%%%c'", code);
>>}
>>break;
>>
>> @@ -5628,7 +5627,7 @@ #define buf_size 20
>>  /* Print address 'x' of a memory access with mode 'mode'.
>> 'op' is the context required by aarch64_classify_address.  It can either 
>> be
>> MEM for a normal memory access or PARALLEL for LDP/STP.  */
>> -static void
>> +static bool
>>  aarch64_print_address_internal (FILE *f, machine_mode mode, rtx x, RTX_CODE 
>> op)
>>  {
>>struct aarch64_address_info addr;
>> @@ -5645,7 +5644,7 @@ aarch64_print_address_internal (FILE *f,
>>   else
>> asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
>>   

C++ patch ping

2017-12-08 Thread Jakub Jelinek
Hi!

I'd like to ping two patches:

http://gcc.gnu.org/ml/gcc-patches/2017-11/msg02521.html
  PR c++/83205 - diagnose invalid std::tuple_size::value for structured
 bindings; the follow-up with plural spelling is approved
 already

http://gcc.gnu.org/ml/gcc-patches/2017-11/msg02629.html
  PR c++/83217 - handle references to non-complete type in structured
 bindings

Jakub


Re: [PR80693] drop value of parallel SETs dropped by combine

2017-12-08 Thread Segher Boessenkool
Hi!

On Thu, Dec 07, 2017 at 07:01:03PM -0200, Alexandre Oliva wrote:

[ Big snip; thanks for all the detailed information! ]

> Still, I'm concerned I haven't caught all of the cases in which
> adjusting REG_N_SETS would be needed: we might have dropped multiple
> SETs of the same pseudo, each with its own REG_UNUSED note (say, from
> all of i3, i2, i1, and i0), and the current logic will only decrement
> REG_N_SETS once, and only if i2 no longer sets the register.

The only thing that matters for combine is that if REG_N_SETS == 1, then
it must be correct.  So "forgetting" to decrement is okay, forgetting
to increment is not.

> > It wasn't obvious to me (this code is horribly complicated).  Whether
> > all existing code is correct...  it's probably best not to look too
> > closely :-/
> 
> You're right about its being horribly complicated.

I would like to remove all regstat things from combine, use "real" DF
instead.  This will simplify many things, and allow better optimisation
as well.  But is it fast enough?  We'll see.

> > If you have a patch you feel confident in, could you post it again
> > please?
> 
> So let me tell you how I feel about this.  It has waited long enough,
> and there are at least 3 bugs known to be fixed by the first very simple
> patch below.  The catch is that it doesn't adjust REG_N_SETS at all (we
> didn't before the patch, and that didn't seem to hurt too much).  I've
> regstrapped this successfully on x86_64-linux-gnu and i686-linux-gnu.


>   PR rtl-optimization/80693
>   PR rtl-optimization/81019
>   PR rtl-optimization/81020
>   * combine.c (distribute_notes): Reset any REG_UNUSED REGs that
>   are not mentioned in i3.  Place the REG_UNUSED note on i2,
>   possibly modified to REG_DEAD, if it did not originate in i3.
> 
> for  gcc/testsuite/ChangeLog
> 
>   PR rtl-optimization/80693
>   PR rtl-optimization/81019
>   PR rtl-optimization/81020
>   * gcc.dg/pr80693.c: New.
>   * gcc.dg/pr81019.c: New.


> But then it occurred to me that the REG_UNUSED register from i1 might
> have *also* been set in i2, and we might then have used it as a scratch
> register for split, while the set that the REG_UNUSED pertained to might
> have been completely discarded from a parallel in i1 or i0.  And then I
> wasn't unsure whether or not to decrement REG_N_SETS.

[ more snip ]

> Then I realized we might have to also reset the cached value when it's
> referenced in i3, and possibly also adjust the counts.  And also some of
> that when it's set in i3.  Then I lost it.  Help? :-)
> 
> FWIW, I'd be glad just installing the patch between --->cut<---s, or
> even a simpler version thereof that just resets the recorded value and
> doesn't even attempt to place notes in i2, to get the bugs fixed while
> we sort out the really tricky issues of adjusting REG_N_SETS.  The
> (incomplete, but functional) fix has been known for a while, and we
> shouldn't subject users to wrong code when we can help it, even if we
> might miss optimization opportunities for that, right?  Thoughts?

Yes, that first patch is okay for trunk.  Thanks for all the work on this!

I don't think this patch makes anything worse, and it does make some things
better.


Segher


Re: [PATCH GCC]More conservative interchanging small loops with const initialized simple reduction

2017-12-08 Thread Richard Biener
On Fri, Dec 8, 2017 at 1:43 PM, Bin.Cheng  wrote:
> On Fri, Dec 8, 2017 at 12:17 PM, Richard Biener
>  wrote:
>> On Fri, Dec 8, 2017 at 12:46 PM, Bin Cheng  wrote:
>>> Hi,
>>> This simple patch makes interchange even more conservative for small loops 
>>> with constant initialized simple reduction.
>>> The reason is undoing such reduction introduces new data reference and 
>>> cond_expr, which could cost too much in a small
>>> loop.
>>> Test gcc.target/aarch64/pr62178.c is fixed with this patch.  Is it OK if 
>>> test passes?
>>
>> Shouldn't we do this even for non-constant initialzied simple
>> reduction?  Because for any simple
>> reduction we add two DRs that are not innermost, for constant
>> initialized we add an additional
>> cond-expr.  So ...
>>
>> +  /* Conservatively skip interchange in cases only have few data references
>> + and constant initialized simple reduction since it introduces new data
>> + reference as well as ?: operation.  */
>> +  if (num_old_inv_drs + num_const_init_simple_reduc * 2 >= datarefs.length 
>> ())
>> +return false;
>> +
>>
>> can you, instead of carrying num_const_init_simple_reduc simply loop
>> over m_reductions
>> and classify them in this function accordingly?  I think we want to
>> cost non-constant-init
>> reductions as well.  The :? can eventually count for another DR for
>> cost purposes.
> Number of non-constant-init reductions can still be carried in struct
> loop_cand?  I am not very sure what's the advantage of an additional
> loop over m_reductions getting the same information.
> Perhaps the increase of stmts should be counted like:
>   num_old_inv_drs + num_const_init_simple_reduc * 2 - num_new_inv_drs
> Question is which number should this be compared against.  (we may
> need to shift num_new_inv_drs to the other side for wrapping issue).
>
>>
>> It looks like we do count the existing DRs for the reduction?  Is that
>> why you arrive
>> at the num_const_init_simple_reduc * 2 figure? (one extra load plus one ?:)
> Yes.
>> But we don't really know whether the DR was invariant in the outer
>> loop (well, I suppose
> Hmm, I might misunderstand here.  num_old_inv_drs tracks the number of
> invariant reference with regarding to inner loop, rather than the
> outer loop.  The same to num_new_inv_drs,
> which means a reference become invariant after loop interchange with
> regarding to (the new) inner loop.  This invariant information is
> always known from data reference, right?
> As for DRs for reduction, we know it's invariant because we set its
> inner loop stride to zero.
>
>> we could remember the DR in m_reductions).
>>
>> Note that the good thing is that the ?: has an invariant condition and
>> thus vectorization
>> can hoist the mask generation out of the vectorized loop which means
>> it boils down to
>> cheap operations.  My gut feeling is that just looking at the number
>> of memory references
>> isn't a good indicator of profitability as the regular stmt workload
>> has a big impact on
>> profitability of vectorization.
> It's not specific to vectorization.  The generated new code also costs
> too much in small loops without vectorization.  But yes, # of mem_refs
> may be too inaccurate, maybe we should check against num_stmts.

Not specific to vectorization but the interchange may pay off only when
vectorizing a loop.  Would the loop in loop-interchange-5.c be still
interchanged?  If we remove the multiplication and just keep
c[i][j] = c[i][j] + b[k][j];
?  That is, why is the constant init so special?  Even for non-constant init
we're changing two outer loop DRs to two non-consecutive inner loop DRs.

Richard.

> Thanks,
> bin
>>
>> So no ack nor nack...
>>
>> Richard.
>>
>>> Thanks,
>>> bin
>>> 2017-12-08  Bin Cheng  
>>>
>>> * gimple-loop-interchange.cc (struct loop_cand): New field.
>>> (loop_cand::loop_cand): Init new field in constructor.
>>> (loop_cand::classify_simple_reduction): Record simple reduction
>>> initialized with constant value.
>>> (should_interchange_loops): New parameter.  Skip interchange if loop
>>> has few data references and constant intitialized simple reduction.
>>> (tree_loop_interchange::interchange): Update call to above function.
>>> (should_interchange_loop_nest): Ditto.


[PATCH] Fix PR81782

2017-12-08 Thread Richard Biener

The following fixes spurious uninit warnings for zero-sized arrays.

Bootstrapped and tested on x86_64-unknown-linux-gnu, applied.

Richard.

2017-12-08  Richard Biener  

PR middle-end/81782
* tree-ssa-uninit.c (warn_uninitialized_vars): Properly
handle accesses outside of zero-sized vars.

* gcc.dg/uninit-pr81782.c: New testcase.

Index: gcc/tree-ssa-uninit.c
===
--- gcc/tree-ssa-uninit.c   (revision 255499)
+++ gcc/tree-ssa-uninit.c   (working copy)
@@ -296,8 +296,8 @@ warn_uninitialized_vars (bool warn_possi
 variable.  */
  if (DECL_P (base)
  && ref.size != -1
- && ref.max_size == ref.size
- && (ref.offset + ref.size <= 0
+ && ((ref.max_size == ref.size
+  && ref.offset + ref.size <= 0)
  || (ref.offset >= 0
  && DECL_SIZE (base)
  && TREE_CODE (DECL_SIZE (base)) == INTEGER_CST
Index: gcc/testsuite/gcc.dg/uninit-pr81782.c
===
--- gcc/testsuite/gcc.dg/uninit-pr81782.c   (nonexistent)
+++ gcc/testsuite/gcc.dg/uninit-pr81782.c   (working copy)
@@ -0,0 +1,14 @@
+/* { dg-do compile } */
+/* { dg-options "-Wmaybe-uninitialized" } */
+
+int
+foo (void)
+{
+  char empty_array[] = { };
+  int i, ret = 0;
+
+  for (i = 0; i < (int) (sizeof (empty_array) / sizeof (empty_array[0])); i++)
+ret = empty_array[i]; /* { dg-bogus "uninitialized" } */
+
+  return ret;
+}


[PATCH] rl78 anddi3 improvement

2017-12-08 Thread Sebastian Perta
Hello,

The following patch improves code size for 64 bit and for RL78:
it emits a library function call instead of emitting code for  the 64 bit
min for every single time.
The and function which was added in libgcc is hand written, so more optimal
than what GCC generates.

The change can easily be seen on the following test case:
unsigned long long my_anddi3(unsigned long long x, unsigned long long y){ 
return x & y;
}
I did not add this to the regression as it very simple and there are test
cases in the regression which test this, for example
gcc.dg/torture/stackalign/alloca-1.c  and
gcc.dg/torture/stackalign/vararg-2.c.
Regression test is OK, tested with the following command:
make -k check-gcc RUNTESTFLAGS=--target_board=rl78-sim

Please let me know if this is OK, Thank you!
Sebastian

Index: gcc/ChangeLog
===
--- gcc/ChangeLog   (revision 255511)
+++ gcc/ChangeLog   (working copy)
@@ -1,3 +1,7 @@
+2017-12-08  Sebastian Perta  
+
+   * config/rl78/rl78.md: New define_expand "anddi3".
+   
 2017-12-08  Martin Jambor  
 
PR tree-optimization/83141
Index: gcc/config/rl78/rl78.md
===
--- gcc/config/rl78/rl78.md (revision 255511)
+++ gcc/config/rl78/rl78.md (working copy)
@@ -718,3 +718,13 @@
   [(set_attr "valloc" "macax")
(set_attr "is_g13_muldiv_insn" "yes")]
 )
+
+(define_expand "anddi3"
+ [(set (match_operand:DI  0 "nonimmediate_operand" "")
+   (and:DI (match_operand:DI 1 "general_operand"  "")
+(match_operand:DI2 "general_operand"  "")))
+   ]
+  "optimize_size"
+  "rl78_emit_libcall (\"__anddi3\", AND, DImode, DImode, 3, operands);
+   DONE;"
+)
Index: libgcc/ChangeLog
===
--- libgcc/ChangeLog(revision 255511)
+++ libgcc/ChangeLog(working copy)
@@ -1,3 +1,8 @@
+2017-12-08  Sebastian Perta  
+
+   * config/rl78/anddi3.S: New assembly file.
+   * config/rl78/t-rl78: Added anddi3.S to LIB2ADD.
+   
 2017-11-30  Michael Meissner  
 
* config/rs6000/_mulkc3.c (__mulkc3): Add forward declaration.
Index: libgcc/config/rl78/anddi3.S
===
--- libgcc/config/rl78/anddi3.S (nonexistent)
+++ libgcc/config/rl78/anddi3.S (working copy)
@@ -0,0 +1,66 @@
+;   Copyright (C) 2017 Free Software Foundation, Inc.
+;   Contributed by Sebastian Perta.
+; 
+; This file is free software; you can redistribute it and/or modify it
+; under the terms of the GNU General Public License as published by the
+; Free Software Foundation; either version 3, or (at your option) any
+; later version.
+; 
+; This file is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of
+; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+; General Public License for more details.
+; 
+; Under Section 7 of GPL version 3, you are granted additional
+; permissions described in the GCC Runtime Library Exception, version
+; 3.1, as published by the Free Software Foundation.
+;
+; You should have received a copy of the GNU General Public License and
+; a copy of the GCC Runtime Library Exception along with this program;
+; see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+; .
+
+
+#include "vregs.h"
+
+.text
+
+START_FUNC ___anddi3
+
+movw  hl, sp
+
+mov   a, [hl+4]
+and   a, [hl+12]
+mov   r8, a
+
+mov   a, [hl+5]
+and   a, [hl+13]
+mov   r9, a
+
+mov   a, [hl+6]
+and   a, [hl+14]
+mov   r10, a
+
+mov   a, [hl+7]
+and   a, [hl+15]
+mov   r11, a
+
+mov   a, [hl+8]
+and   a, [hl+16]
+mov   r12, a
+
+mov   a, [hl+9]
+and   a, [hl+17]
+mov   r13, a
+
+mov   a, [hl+10]
+and   a, [hl+18]
+mov   r14, a
+
+mov   a, [hl+11]
+and   a, [hl+19]
+mov   r15, a
+
+ret
+
+END_FUNC ___anddi3
Index: libgcc/config/rl78/t-rl78
===
--- libgcc/config/rl78/t-rl78   (revision 255511)
+++ libgcc/config/rl78/t-rl78   (working copy)
@@ -32,7 +32,8 @@
$(srcdir)/config/rl78/fpmath-sf.S \
$(srcdir)/config/rl78/cmpsi2.S \
$(srcdir)/config/rl78/adddi3.S \
-   $(srcdir)/config/rl78/subdi3.S
+   $(srcdir)/config/rl78/subdi3.S \
+   $(srcdir)/config/rl78/anddi3.S
 
 LIB2FUNCS_EXCLUDE = _clzhi2 _clzsi2 _ctzhi2 _ctzsi2 \
   _popcounthi2 _popcountsi2 \



[PATCH][GCC][ARM] Fix failing testcase pragma_fpu_attribute.c

2017-12-08 Thread Tamar Christina
Hi All,

My previous patch had two issues with the new test cases.
It seems that depending on which DejaGnu version you have 
dg-additional-options will add the options before or after the
ones added by the test suite. Which means I can't use it to override
the default options.

For this I use a pragma now and place the pragma before GCC needs to emit
any code. Which in turn means it doesn't emit the .fpu directive for the first
switching of fpus.

Secondly, because of the usage of neon I also need to guard against arm_neon_ok.

Regtested on arm-none-eabi and no regressions.

Ok for trunk?


gcc/testsuite/
2017-12-08  Tamar Christina  

PR target/82641
* gcc.target/arm/pragma_fpu_attribute.c: New.
* gcc.target/arm/pragma_fpu_attribute_2.c: New.

-- 
diff --git a/gcc/testsuite/gcc.target/arm/pragma_fpu_attribute.c b/gcc/testsuite/gcc.target/arm/pragma_fpu_attribute.c
index f47c745855e4acc099afd554838dcf7d031f798c..5f039d9bfb2b14f9134f138527fc395b8e273bbb 100644
--- a/gcc/testsuite/gcc.target/arm/pragma_fpu_attribute.c
+++ b/gcc/testsuite/gcc.target/arm/pragma_fpu_attribute.c
@@ -1,11 +1,14 @@
 /* Test for target attribute assembly extension generations.  */
 /* { dg-do compile } */
 /* { dg-require-effective-target arm_arch_v8a_ok } */
-/* { dg-additional-options "-std=gnu99 -mfpu=vfpv3-d16" } */
+/* { dg-require-effective-target arm_neon_ok } */
+/* { dg-additional-options "-std=gnu99" } */
 
 #include 
 #include 
 
+#pragma GCC target("fpu=vfpv3-d16")
+
 extern uint32_t bar();
 
 __attribute__((target("fpu=crypto-neon-fp-armv8"))) poly64x1_t vsricw(poly64x1_t crc, uint32_t val)
diff --git a/gcc/testsuite/gcc.target/arm/pragma_fpu_attribute_2.c b/gcc/testsuite/gcc.target/arm/pragma_fpu_attribute_2.c
index f23fd83779e57e48c0035b6688a21850d12cb4ab..b710de38612707b9109966f7bbc694a913121cb6 100644
--- a/gcc/testsuite/gcc.target/arm/pragma_fpu_attribute_2.c
+++ b/gcc/testsuite/gcc.target/arm/pragma_fpu_attribute_2.c
@@ -1,11 +1,14 @@
 /* Test for #pragma assembly extension generations.  */
 /* { dg-do compile } */
 /* { dg-require-effective-target arm_arch_v8a_ok } */
-/* { dg-additional-options "-std=gnu99 -mfpu=vfpv3-d16" } */
+/* { dg-require-effective-target arm_neon_ok } */
+/* { dg-additional-options "-std=gnu99" } */
 
 #include 
 #include 
 
+#pragma GCC target("fpu=vfpv3-d16")
+
 extern uint32_t bar();
 
 #pragma GCC push_options



Re: [PATCH] Fix PR81782

2017-12-08 Thread Jakub Jelinek
On Fri, Dec 08, 2017 at 03:44:03PM +0100, Richard Biener wrote:
> 
> The following fixes spurious uninit warnings for zero-sized arrays.
> 
> Bootstrapped and tested on x86_64-unknown-linux-gnu, applied.
> 
> Richard.
> 
> 2017-12-08  Richard Biener  
> 
>   PR middle-end/81782
>   * tree-ssa-uninit.c (warn_uninitialized_vars): Properly
>   handle accesses outside of zero-sized vars.

Is ref.max_size == -1 always guaranteeing no access before offset, only
after it?
What I fear is e.g. ARRAY_REF with non-constant index with base of
MEM_REF with & of a middle of array or something similar?
struct S { int x[64]; } foo;
MEM_REF[&foo, 40][i] with negative i.

Anyway, in this case it is about not printing a warning, so we can give up
even if it is theoretically possible.

> --- gcc/tree-ssa-uninit.c (revision 255499)
> +++ gcc/tree-ssa-uninit.c (working copy)
> @@ -296,8 +296,8 @@ warn_uninitialized_vars (bool warn_possi
>variable.  */
> if (DECL_P (base)
> && ref.size != -1
> -   && ref.max_size == ref.size
> -   && (ref.offset + ref.size <= 0
> +   && ((ref.max_size == ref.size
> +&& ref.offset + ref.size <= 0)
> || (ref.offset >= 0
> && DECL_SIZE (base)
> && TREE_CODE (DECL_SIZE (base)) == INTEGER_CST

Jakub


Re: [PATCH GCC]More conservative interchanging small loops with const initialized simple reduction

2017-12-08 Thread Bin.Cheng
On Fri, Dec 8, 2017 at 2:40 PM, Richard Biener
 wrote:
> On Fri, Dec 8, 2017 at 1:43 PM, Bin.Cheng  wrote:
>> On Fri, Dec 8, 2017 at 12:17 PM, Richard Biener
>>  wrote:
>>> On Fri, Dec 8, 2017 at 12:46 PM, Bin Cheng  wrote:
 Hi,
 This simple patch makes interchange even more conservative for small loops 
 with constant initialized simple reduction.
 The reason is undoing such reduction introduces new data reference and 
 cond_expr, which could cost too much in a small
 loop.
 Test gcc.target/aarch64/pr62178.c is fixed with this patch.  Is it OK if 
 test passes?
>>>
>>> Shouldn't we do this even for non-constant initialzied simple
>>> reduction?  Because for any simple
>>> reduction we add two DRs that are not innermost, for constant
>>> initialized we add an additional
>>> cond-expr.  So ...
>>>
>>> +  /* Conservatively skip interchange in cases only have few data references
>>> + and constant initialized simple reduction since it introduces new data
>>> + reference as well as ?: operation.  */
>>> +  if (num_old_inv_drs + num_const_init_simple_reduc * 2 >= datarefs.length 
>>> ())
>>> +return false;
>>> +
>>>
>>> can you, instead of carrying num_const_init_simple_reduc simply loop
>>> over m_reductions
>>> and classify them in this function accordingly?  I think we want to
>>> cost non-constant-init
>>> reductions as well.  The :? can eventually count for another DR for
>>> cost purposes.
>> Number of non-constant-init reductions can still be carried in struct
>> loop_cand?  I am not very sure what's the advantage of an additional
>> loop over m_reductions getting the same information.
>> Perhaps the increase of stmts should be counted like:
>>   num_old_inv_drs + num_const_init_simple_reduc * 2 - num_new_inv_drs
>> Question is which number should this be compared against.  (we may
>> need to shift num_new_inv_drs to the other side for wrapping issue).
>>
>>>
>>> It looks like we do count the existing DRs for the reduction?  Is that
>>> why you arrive
>>> at the num_const_init_simple_reduc * 2 figure? (one extra load plus one ?:)
>> Yes.
>>> But we don't really know whether the DR was invariant in the outer
>>> loop (well, I suppose
>> Hmm, I might misunderstand here.  num_old_inv_drs tracks the number of
>> invariant reference with regarding to inner loop, rather than the
>> outer loop.  The same to num_new_inv_drs,
>> which means a reference become invariant after loop interchange with
>> regarding to (the new) inner loop.  This invariant information is
>> always known from data reference, right?
>> As for DRs for reduction, we know it's invariant because we set its
>> inner loop stride to zero.
>>
>>> we could remember the DR in m_reductions).
>>>
>>> Note that the good thing is that the ?: has an invariant condition and
>>> thus vectorization
>>> can hoist the mask generation out of the vectorized loop which means
>>> it boils down to
>>> cheap operations.  My gut feeling is that just looking at the number
>>> of memory references
>>> isn't a good indicator of profitability as the regular stmt workload
>>> has a big impact on
>>> profitability of vectorization.
>> It's not specific to vectorization.  The generated new code also costs
>> too much in small loops without vectorization.  But yes, # of mem_refs
>> may be too inaccurate, maybe we should check against num_stmts.
>
> Not specific to vectorization but the interchange may pay off only when
> vectorizing a loop.  Would the loop in loop-interchange-5.c be still
> interchanged?  If we remove the multiplication and just keep
> c[i][j] = c[i][j] + b[k][j];
Both loop-interchange-5.c and the modified version are interchange,
because we check
it against number of all data references (including num_old_inv_drs):
 if (num_old_inv_drs + num_const_init_simple_reduc * 2 >= datarefs.length ())

> ?  That is, why is the constant init so special?  Even for non-constant init
> we're changing two outer loop DRs to two non-consecutive inner loop DRs.
No, the two outer loop DRs becomes consecutive with respect to inner loop.
So for a typical matrix mul case, the interchange moves two outer loop
DRs into inner loops, moves an inner loop DR out to outer loop.
Overall it introduces an additional instruction.  In terms of cache
behavior, it's even cheaper?  Because the two new DRs access the same
memory object.
As for pr62178.c, assembly regressed from:
.L3:
ldr w3, [x0], 124
ldr w4, [x2, 4]!
cmp x0, x5
maddw1, w4, w3, w1
bne .L3

To:
.L3:
ldr w2, [x3, x0, lsl 2]
cmp w5, 1
ldr w1, [x4, x0, lsl 2]
cselw2, w2, wzr, ne
maddw1, w6, w1, w2
str w1, [x3, x0, lsl 2]
add x0, x0, 1
cmp x0, 31
bne .L3

Without vectorization.  Two additional instruction for cond_expr, one
additional memory reference for interchange, and one additional
instruction because of ivopt/address

wwwdocs: Power ISA documentation

2017-12-08 Thread Segher Boessenkool
Hi!

The Power ISA documentation is now available without registration.

I have committed the wwwdocs patch below.

Enjoy,


Segher


Index: htdocs/readings.html
===
RCS file: /cvs/gcc/wwwdocs/htdocs/readings.html,v
retrieving revision 1.283
diff -u -3 -p -r1.283 readings.html
--- htdocs/readings.html5 Nov 2017 11:28:20 -   1.283
+++ htdocs/readings.html8 Dec 2017 15:15:24 -
@@ -245,6 +245,7 @@ names.
  
  rs6000 (powerpc, powerpcle)
   Manufacturer: IBM, Motorola
+  https://www.ibm.com/systems/power/openpower/";>Power ISA
   https://members.openpowerfoundation.org/document/dl/576";>64-Bit ELF V2 
ABI - OpenPOWER ABI
   http://publib16.boulder.ibm.com/pseries/en_US/infocenter/base/43_docs/aixassem/alangref/toc.htm";>AIX
 V4.3 Assembler Language Ref.
   http://publibn.boulder.ibm.com/doc_link/en_US/a_doc_lib/aixassem/alangref/alangreftfrm.htm";>AIX
 5L Assembler Language Ref.


Re: [PATCH] rl78 anddi3 improvement

2017-12-08 Thread Jeff Law
On 12/08/2017 07:50 AM, Sebastian Perta wrote:
> Hello,
> 
> The following patch improves code size for 64 bit and for RL78:
> it emits a library function call instead of emitting code for  the 64 bit
> min for every single time.
> The and function which was added in libgcc is hand written, so more optimal
> than what GCC generates.
> 
> The change can easily be seen on the following test case:
> unsigned long long my_anddi3(unsigned long long x, unsigned long long y){ 
> return x & y;
> }
> I did not add this to the regression as it very simple and there are test
> cases in the regression which test this, for example
> gcc.dg/torture/stackalign/alloca-1.c  and
> gcc.dg/torture/stackalign/vararg-2.c.
> Regression test is OK, tested with the following command:
> make -k check-gcc RUNTESTFLAGS=--target_board=rl78-sim
> 
> Please let me know if this is OK, Thank you!
> Sebastian
> 
> Index: gcc/ChangeLog
> ===
> --- gcc/ChangeLog (revision 255511)
> +++ gcc/ChangeLog (working copy)
> @@ -1,3 +1,7 @@
> +2017-12-08  Sebastian Perta  
> +
> + * config/rl78/rl78.md: New define_expand "anddi3".
So I think you're ultimately far better off determining why GCC does not
generate efficient code for 64bit logicals on the rl78 target.

Once you start defining these kind of operations you'll often end up
defining more and more rather than addressing the core underlying issues.

Furthermore, defining multi-word operations will inhibit optimizations
that would have applied had the multi-word operation been broken down
into word-sized operations.  I've seen this time and time again when
32bit targets define DImode operations.  So not only will you end up
with a lot of these patterns, you then end up defining lots of special
cases for them to catch some of hte missed optimizations.

DJ has the final call here, but I think you'd be better off looking
deeper and trying to understand how to get better code for multi-word
operations without defining multi-word libcall expanders like this.

jeff



Re: [PATCH] Fix PR81782

2017-12-08 Thread Richard Biener
On December 8, 2017 3:58:11 PM GMT+01:00, Jakub Jelinek  
wrote:
>On Fri, Dec 08, 2017 at 03:44:03PM +0100, Richard Biener wrote:
>> 
>> The following fixes spurious uninit warnings for zero-sized arrays.
>> 
>> Bootstrapped and tested on x86_64-unknown-linux-gnu, applied.
>> 
>> Richard.
>> 
>> 2017-12-08  Richard Biener  
>> 
>>  PR middle-end/81782
>>  * tree-ssa-uninit.c (warn_uninitialized_vars): Properly
>>  handle accesses outside of zero-sized vars.
>
>Is ref.max_size == -1 always guaranteeing no access before offset, only
>after it?

Yes. 

>What I fear is e.g. ARRAY_REF with non-constant index with base of
>MEM_REF with & of a middle of array or something similar?
>struct S { int x[64]; } foo;
>MEM_REF[&foo, 40][i] with negative i.

The array type constraints the access. Invalid accesses invoke undefined 
behavior. 

>Anyway, in this case it is about not printing a warning, so we can give
>up
>even if it is theoretically possible.
>
>> --- gcc/tree-ssa-uninit.c(revision 255499)
>> +++ gcc/tree-ssa-uninit.c(working copy)
>> @@ -296,8 +296,8 @@ warn_uninitialized_vars (bool warn_possi
>>   variable.  */
>>if (DECL_P (base)
>>&& ref.size != -1
>> -  && ref.max_size == ref.size
>> -  && (ref.offset + ref.size <= 0
>> +  && ((ref.max_size == ref.size
>> +   && ref.offset + ref.size <= 0)
>>|| (ref.offset >= 0
>>&& DECL_SIZE (base)
>>&& TREE_CODE (DECL_SIZE (base)) == INTEGER_CST
>
>   Jakub



Re: [PATCH][i386] Fix PR83008

2017-12-08 Thread Jan Hubicka
> 
> This restores the vec_construct cost dependence on the vector element
> count.  Honza removed this (accidentially?) during the rework.
> 
> Bootstrap / regtest running on x86_64-unknown-linux-gnu, ok for trunk?

Hmm, the false parameter to ix86_vec_cost is supposed to do that.  It uses:

if (!parallel)
return cost * GET_MODE_NUNITS (mode);   

Why it doesn't work?

Honza
> 
> Thanks,
> Richard.
> 
> 2017-12-08  Richard Biener  
> 
>   PR target/83008
>   * config/i386/i386.c (ix86_builtin_vectorization_cost): Restore
>   vec_construct dependence on vector element count.
> 
> Index: gcc/config/i386/i386.c
> ===
> --- gcc/config/i386/i386.c(revision 255499)
> +++ gcc/config/i386/i386.c(working copy)
> @@ -44879,7 +44879,8 @@ ix86_builtin_vectorization_cost (enum ve
> ix86_cost->sse_op, true);
>  
>case vec_construct:
> - return ix86_vec_cost (mode, ix86_cost->sse_op, false);
> + return (ix86_vec_cost (mode, ix86_cost->sse_op, false)
> + * (TYPE_VECTOR_SUBPARTS (vectype) - 1));
>  
>default:
>  gcc_unreachable ();


Re: [PATCH 5/3] C++ bits to improve detection of attribute conflicts (PR 81544)

2017-12-08 Thread Martin Sebor

On 12/08/2017 04:30 AM, Jakub Jelinek wrote:

On Fri, Dec 08, 2017 at 11:32:54AM +0100, Rainer Orth wrote:

Jakub Jelinek  writes:


On Fri, Dec 08, 2017 at 10:43:58AM +0100, Rainer Orth wrote:

The line numbers are completely misleading, unfortunately.  Hadn't
SUBTARGET_ATTRIBUTE_TABLE been used at the end of the (very short)
sparc_attribute_table, I wouldn't have seen what was wrong.

The following patch fixes the problem, installed on mainline after
i386-pc-solaris2.11 and sparc-sun-solaris2.11 bootstraps completed
without regressions.

However, there are two more SUBTARGET_ATTRIBUTE_TABLE defines:

gcc/config/darwin.h:#define SUBTARGET_ATTRIBUTE_TABLE \
gcc/config/i386/cygming.h:#define SUBTARGET_ATTRIBUTE_TABLE \


I'll deal with this.


This is what I'm currently testing with an  x86_64-apple-darwin11.4.2
bootstrap.  Ok if it passes?


I've committed this instead, while only darwin.h and cygming.h were left
broken, the comments haven't been adjusted in many more cases.


Sorry about that and thanks for fixing it up.  Hopefully the Solaris
and Darwin files were the only ones I missed.  They were hidden from
my search by macros.

Martin



BTW, we should swap handler and affects_type_identity fields at some point,
3 bools, then pointer, then bool, then pointer needs quite a lot of padding.

2017-12-08  Jakub Jelinek  

* config/arc/arc.c (arc_attribute_table): Add exclusions to
the comment.
* config/avr/avr.c (avr_attribute_table): Likewise.
* config/msp430/msp430.c (msp430_attribute_table): Likewise.
* config/rl78/rl78.c (rl78_attribute_table): Likewise.
* config/nds32/nds32.c (nds32_attribute_table): Likewise.
* config/darwin.h (SUBTARGET_ATTRIBUTE_TABLE): Initialize new member
of struct attribute_spec.
* config/i386/cygming.h (SUBTARGET_ATTRIBUTE_TABLE): Likewise.
ada/
* gcc-interface/utils.c (gnat_internal_attribute_table): Add
exclusions to the comment.
brig/
* brig-lang.c (brig_attribute_table): Fix up comment.

--- gcc/config/arc/arc.c.jj 2017-12-07 18:05:03.0 +0100
+++ gcc/config/arc/arc.c2017-12-08 11:20:24.605501619 +0100
@@ -218,7 +218,7 @@ static tree arc_handle_fndecl_attribute
 const struct attribute_spec arc_attribute_table[] =
 {
  /* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler,
-  affects_type_identity } */
+  affects_type_identity, exclusions } */
   { "interrupt", 1, 1, true, false, false, arc_handle_interrupt_attribute,
   true, NULL },
   /* Function calls made to this symbol must be done indirectly, because
--- gcc/config/avr/avr.c.jj 2017-12-07 18:05:02.0 +0100
+++ gcc/config/avr/avr.c2017-12-08 11:24:32.270391537 +0100
@@ -9875,7 +9875,7 @@ static const struct attribute_spec
 avr_attribute_table[] =
 {
   /* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler,
-   affects_type_identity } */
+   affects_type_identity, exclusions } */
   { "progmem",   0, 0, false, false, false,  avr_handle_progmem_attribute,
 false, NULL },
   { "signal",0, 0, true,  false, false,  avr_handle_fndecl_attribute,
--- gcc/config/darwin.h.jj  2017-11-28 12:11:41.0 +0100
+++ gcc/config/darwin.h 2017-12-08 11:18:05.866243854 +0100
@@ -741,11 +741,11 @@ extern GTY(()) section * darwin_sections
 /* Extra attributes for Darwin.  */
 #define SUBTARGET_ATTRIBUTE_TABLE   \
   /* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler, \
-   affects_type_identity } */  
 \
+   affects_type_identity, exclusions } */   \
   { "apple_kext_compatibility", 0, 0, false, true, false, \
-darwin_handle_kext_attribute, false },  \
+darwin_handle_kext_attribute, false, NULL },\
   { "weak_import", 0, 0, true, false, false,  \
-darwin_handle_weak_import_attribute, false }
+darwin_handle_weak_import_attribute, false, NULL }

 /* Make local constant labels linker-visible, so that if one follows a
weak_global constant, ld64 will be able to separate the atoms.  */
--- gcc/config/i386/cygming.h.jj2017-10-30 12:02:35.0 +0100
+++ gcc/config/i386/cygming.h   2017-12-08 11:25:12.372887945 +0100
@@ -448,9 +448,9 @@ do {\

 #define SUBTARGET_ATTRIBUTE_TABLE \
   { "selectany", 0, 0, true, false, false, ix86_handle_selectany_attribute, \
-false }
+false, NULL }
   /* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler,
-   affects_type_identity } */
+   affects_type_identity, exclusions } */

 /*  mcount() does not need a counter variable.  */
 #undef NO_PROFILE_COUNTERS
--- gcc/config/msp430/msp430.c.jj   2017-12-07 18:05:03.0 +0100
+++ gcc/config/msp430/msp4

Re: [AArch64] Fix ICEs in aarch64_print_operand

2017-12-08 Thread Richard Sandiford
Christophe Lyon  writes:
> Hi Richard,
> On 7 December 2017 at 10:31, James Greenhalgh  
> wrote>> On Tue, Dec 05, 2017 at 05:57:37PM +, Richard Sandiford wrote:
>>> Three related regression fixes:
>>>
>>> - We can't use asserts like:
>>>
>>> gcc_assert (GET_MODE_SIZE (mode) == 16);
>>>
>>>   in aarch64_print_operand because it could trigger for invalid user input.
>>>
>>> - The output_operand_lossage in aarch64_print_address_internal:
>>>
>>> output_operand_lossage ("invalid operand for '%%%c'", op);
>>>
>>>   wasn't right because "op" is an rtx_code enum rather than the
>>>   prefix character.
>>>
>>> - aarch64_print_operand_address shouldn't call output_operand_lossage
>>>   (because it doesn't have a prefix code) but instead fall back to
>>>   output_addr_const.
>>>
>>> Tested on aarch64-linux-gnu.  OK to install?
>>
>> OK.
>>
>> Thanks,
>> James
>>
>>>
>>> Thanks,
>>> Richard
>>>
>>>
>>> 2017-12-05  Richard Sandiford  
>>>
>>> gcc/
>>>   * config/aarch64/aarch64.c (aarch64_print_address_internal): Return
>>>   a bool success value.  Don't call output_operand_lossage here.
>>>   (aarch64_print_ldpstp_address): Return a bool success value.
>>>   (aarch64_print_operand_address): Call output_addr_const if
>>>   aarch64_print_address_internal fails.
>>>   (aarch64_print_operand): Don't assert that the mode is 16 bytes for
>>>   'y'; call output_operand_lossage instead.  Call output_operand_lossage
>>>   if aarch64_print_ldpstp_address fails.
>>>
>>> gcc/testsuite/
>>>   * gcc.target/aarch64/asm-2.c: New test.
>>>   * gcc.target/aarch64/asm-3.c: Likewise.
>>>
>
> The new test gcc.target/aarch64/asm-2.c ICEs when compiled with -mabi=ilp32:
>
> /gcc/testsuite/gcc.target/aarch64/asm-2.c: In function 'f':
> /gcc/testsuite/gcc.target/aarch64/asm-2.c:10:1: internal compiler
> error: in aarch64_print_address_internal, at
> config/aarch64/aarch64.c:5636
> 0xf2afd3 aarch64_print_address_internal
> /gcc/config/aarch64/aarch64.c:5636
> 0xf2affd aarch64_print_operand_address
> /gcc/config/aarch64/aarch64.c:5733
> 0x7fdd43 output_address(machine_mode, rtx_def*)
> /gcc/final.c:3913
> 0x801288 output_asm_insn(char const*, rtx_def**)
> /gcc/final.c:3770
> 0x802437 final_scan_insn(rtx_insn*, _IO_FILE*, int, int, int*)
> /gcc/final.c:2673
> 0x802a1a final(rtx_insn*, _IO_FILE*, int)
> /gcc/final.c:2052
> 0x8035ab rest_of_handle_final
> /gcc/final.c:4498
> 0x8035ab execute
> /gcc/final.c:4572
>
> Can you check?

I think that's a separate preexisting problem.  Could you file a PR?

Personally I'd just remove the assert, but I'm guessing that wouldn't
be acceptable...

Thanks,
Richard


Re: [PATCH][i386] Fix PR83008

2017-12-08 Thread Richard Biener
On December 8, 2017 4:51:16 PM GMT+01:00, Jan Hubicka  wrote:
>> 
>> This restores the vec_construct cost dependence on the vector element
>> count.  Honza removed this (accidentially?) during the rework.
>> 
>> Bootstrap / regtest running on x86_64-unknown-linux-gnu, ok for
>trunk?
>
>Hmm, the false parameter to ix86_vec_cost is supposed to do that.  It
>uses:
>
>if (!parallel) 
>  
>return cost * GET_MODE_NUNITS (mode);  
>
>
>Why it doesn't work?

I see. Not exactly the same. I guess we need to see why the costs favor a 16 
element vector in it plus store over 16 scalar stores then... 

See the PR. 

>
>Honza
>> 
>> Thanks,
>> Richard.
>> 
>> 2017-12-08  Richard Biener  
>> 
>>  PR target/83008
>>  * config/i386/i386.c (ix86_builtin_vectorization_cost): Restore
>>  vec_construct dependence on vector element count.
>> 
>> Index: gcc/config/i386/i386.c
>> ===
>> --- gcc/config/i386/i386.c   (revision 255499)
>> +++ gcc/config/i386/i386.c   (working copy)
>> @@ -44879,7 +44879,8 @@ ix86_builtin_vectorization_cost (enum ve
>>ix86_cost->sse_op, true);
>>  
>>case vec_construct:
>> -return ix86_vec_cost (mode, ix86_cost->sse_op, false);
>> +return (ix86_vec_cost (mode, ix86_cost->sse_op, false)
>> +* (TYPE_VECTOR_SUBPARTS (vectype) - 1));
>>  
>>default:
>>  gcc_unreachable ();



[C++ PATCH] Harmonize C++ flexible array member initialization with C (PR c++/80135, PR c++/81922)

2017-12-08 Thread Jakub Jelinek
Hi!

Martin's patch a few years ago started allowing flexible array members
inside of nested aggregates, similarly to what we were doing in C.
But C rejects cases where we in nested context try to initialize a flexible
array member with a non-empty initializer, because that is something that
can't really work. Say if a flexible array member is inside of a struct
and we are initializing an array of such structs, we can't really have
each array element with a different width based on how large was the
initializer for a particular element's flexible array member.
After Martin's change, we were accepting those and silently generating bogus
assembly (claiming some size of elements but the initializer really followed
the sizes of what was added there), then I think Nathan added some
verification and since then we usually just ICE on those.

This patch does the similar thing in the C++ FE to what the C FE does, i.e.
allow empty initializers of flexible array members ( {}, not "" as that is
already non-zero size) everywhere, and for others allow them only for the
outermost struct/class/union.
Allowing the empty flexible array members is IMHO useful, people can have
say some general structure that is sometimes used as toplevel object and
can be initialized with arbitrarily sized array, and sometimes just use it
inside other structs or arrays if the array isn't needed.

digest_init_r already had a nested argument, but it wasn't actually the
nesting this patch is looking for, because nested true is already in
processing the CONSTRUCTOR for flexible array member, so I've changed it to
an int that tracks limited depth information (just 0 (former nested ==
false), 1 and 2 (both former nested == true), where 2 is used when we
digest_init_r once or more times more).

Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

2017-12-08  Jakub Jelinek  

PR c++/80135
PR c++/81922
* typeck2.c (digest_init_r): Change nested argument type from bool to
int.  Use code instead of TREE_CODE (type) where possible.  If
nested == 2, diagnose initialization of flexible array member with
STRING_CST.  Pass nested to process_init_constructor.  Formatting fix.
(digest_init, digest_init_flags): Adjust digest_init_r caller.
(massage_init_elt): Add nested argument.  Pass 2 instead of 1 to
digest_init_r's nested argument if nested is non-zero.
(process_init_constructor_array): Add nested argument.  If nested == 2,
diagnose initialization of flexible array member with non-empty
braced enclosed list.  Pass nested to massage_init_elt.
(process_init_constructor_record, process_init_constructor_union): Add
nested argument, pass it to massage_init_elt.
(process_init_constructor): Add nested argument, pass it to
process_init_constructor_{array,record,union}.
* init.c (find_field_init): Return NULL_TREE if init is
error_mark_node.  Don't look through nested CONSTRUCTORs.

* g++.dg/warn/Wplacement-new-size-1.C (fBx1): Initialize nested
flexible array member only with {}.  Add dg-warning.
(fBx2, fBx3): Remove.
* g++.dg/warn/Wplacement-new-size-2.C (fBx1): Initialize nested
flexible array member only with {}.  Add dg-warning.
(fBx2, fBx3): Remove.
* g++.dg/warn/Wplacement-new-size-6.C: New test.
* g++.dg/ext/flexary13.C (main): Remove test for initialization
of nested flexible array member with non-empty initializer.
* g++.dg/ext/flexary25.C: New test.
* g++.dg/ext/flexary26.C: New test.
* g++.dg/ext/flexary27.C: New test.
* g++.dg/parse/pr43765.C: Expect diagnostics about initialization
of nested flexible array member with non-empty initializer.  Expect
C++2A diagnostics about mixing of designated and non-designated
initializers.

--- gcc/cp/typeck2.c.jj 2017-12-07 18:04:54.362753051 +0100
+++ gcc/cp/typeck2.c2017-12-08 12:55:39.740361230 +0100
@@ -34,7 +34,8 @@ along with GCC; see the file COPYING3.
 #include "intl.h"
 
 static tree
-process_init_constructor (tree type, tree init, tsubst_flags_t complain);
+process_init_constructor (tree type, tree init, int nested,
+ tsubst_flags_t complain);
 
 
 /* Print an error message stemming from an attempt to use
@@ -996,10 +997,11 @@ check_narrowing (tree type, tree init, t
For aggregate types, it assumes that reshape_init has already run, thus the
initializer will have the right shape (brace elision has been undone).
 
-   NESTED is true iff we are being called for an element of a CONSTRUCTOR.  */
+   NESTED is non-zero iff we are being called for an element of a CONSTRUCTOR,
+   2 iff the element of a CONSTRUCTOR is inside another CONSTRUCTOR.  */
 
 static tree
-digest_init_r (tree type, tree init, bool nested, int flags,
+digest_init_r (tree type, tree init, int nested, int flags,
 

[PATCH] Fix 2 typos

2017-12-08 Thread Jakub Jelinek
Hi!

There is no get_base_ref_and_offset function and AFAIK never has been,
my guess is these comments meant get_ref_base_and_extent.
Ok for trunk?

2017-12-08  Jakub Jelinek  

* ipa-polymorphic-call.c (noncall_stmt_may_be_vtbl_ptr_store): Fix
a comment typo, get_base_ref_and_offset -> get_ref_base_and_extent.
* ipa-prop.c (stmt_may_be_vtbl_ptr_store): Likewise.

--- gcc/ipa-polymorphic-call.c.jj   2017-10-10 22:04:02.0 +0200
+++ gcc/ipa-polymorphic-call.c  2017-12-08 15:49:00.883187008 +0100
@@ -1149,7 +1149,7 @@ noncall_stmt_may_be_vtbl_ptr_store (gimp
  if (TREE_CODE (lhs) == COMPONENT_REF
  && !DECL_VIRTUAL_P (TREE_OPERAND (lhs, 1)))
return false;
- /* In the future we might want to use get_base_ref_and_offset to find
+ /* In the future we might want to use get_ref_base_and_extent to find
 if there is a field corresponding to the offset and if so, proceed
 almost like if it was a component ref.  */
}
--- gcc/ipa-prop.c.jj   2017-11-28 22:19:30.0 +0100
+++ gcc/ipa-prop.c  2017-12-08 15:48:43.848397515 +0100
@@ -652,7 +652,7 @@ stmt_may_be_vtbl_ptr_store (gimple *stmt
  if (TREE_CODE (lhs) == COMPONENT_REF
  && !DECL_VIRTUAL_P (TREE_OPERAND (lhs, 1)))
return false;
- /* In the future we might want to use get_base_ref_and_offset to find
+ /* In the future we might want to use get_ref_base_and_extent to find
 if there is a field corresponding to the offset and if so, proceed
 almost like if it was a component ref.  */
}

Jakub


Re: [PATCH GCC]More conservative interchanging small loops with const initialized simple reduction

2017-12-08 Thread Bin.Cheng
On Fri, Dec 8, 2017 at 3:18 PM, Bin.Cheng  wrote:
> On Fri, Dec 8, 2017 at 2:40 PM, Richard Biener
>  wrote:
>> On Fri, Dec 8, 2017 at 1:43 PM, Bin.Cheng  wrote:
>>> On Fri, Dec 8, 2017 at 12:17 PM, Richard Biener
>>>  wrote:
 On Fri, Dec 8, 2017 at 12:46 PM, Bin Cheng  wrote:
> Hi,
> This simple patch makes interchange even more conservative for small 
> loops with constant initialized simple reduction.
> The reason is undoing such reduction introduces new data reference and 
> cond_expr, which could cost too much in a small
> loop.
> Test gcc.target/aarch64/pr62178.c is fixed with this patch.  Is it OK if 
> test passes?

 Shouldn't we do this even for non-constant initialzied simple
 reduction?  Because for any simple
 reduction we add two DRs that are not innermost, for constant
 initialized we add an additional
 cond-expr.  So ...

 +  /* Conservatively skip interchange in cases only have few data 
 references
 + and constant initialized simple reduction since it introduces new 
 data
 + reference as well as ?: operation.  */
 +  if (num_old_inv_drs + num_const_init_simple_reduc * 2 >= 
 datarefs.length ())
 +return false;
 +

 can you, instead of carrying num_const_init_simple_reduc simply loop
 over m_reductions
 and classify them in this function accordingly?  I think we want to
 cost non-constant-init
 reductions as well.  The :? can eventually count for another DR for
 cost purposes.
>>> Number of non-constant-init reductions can still be carried in struct
>>> loop_cand?  I am not very sure what's the advantage of an additional
>>> loop over m_reductions getting the same information.
>>> Perhaps the increase of stmts should be counted like:
>>>   num_old_inv_drs + num_const_init_simple_reduc * 2 - num_new_inv_drs
>>> Question is which number should this be compared against.  (we may
>>> need to shift num_new_inv_drs to the other side for wrapping issue).
>>>

 It looks like we do count the existing DRs for the reduction?  Is that
 why you arrive
 at the num_const_init_simple_reduc * 2 figure? (one extra load plus one ?:)
>>> Yes.
 But we don't really know whether the DR was invariant in the outer
 loop (well, I suppose
>>> Hmm, I might misunderstand here.  num_old_inv_drs tracks the number of
>>> invariant reference with regarding to inner loop, rather than the
>>> outer loop.  The same to num_new_inv_drs,
>>> which means a reference become invariant after loop interchange with
>>> regarding to (the new) inner loop.  This invariant information is
>>> always known from data reference, right?
>>> As for DRs for reduction, we know it's invariant because we set its
>>> inner loop stride to zero.
>>>
 we could remember the DR in m_reductions).

 Note that the good thing is that the ?: has an invariant condition and
 thus vectorization
 can hoist the mask generation out of the vectorized loop which means
 it boils down to
 cheap operations.  My gut feeling is that just looking at the number
 of memory references
 isn't a good indicator of profitability as the regular stmt workload
 has a big impact on
 profitability of vectorization.
>>> It's not specific to vectorization.  The generated new code also costs
>>> too much in small loops without vectorization.  But yes, # of mem_refs
>>> may be too inaccurate, maybe we should check against num_stmts.
>>
>> Not specific to vectorization but the interchange may pay off only when
>> vectorizing a loop.  Would the loop in loop-interchange-5.c be still
>> interchanged?  If we remove the multiplication and just keep
>> c[i][j] = c[i][j] + b[k][j];
> Both loop-interchange-5.c and the modified version are interchange,
> because we check
> it against number of all data references (including num_old_inv_drs):
>  if (num_old_inv_drs + num_const_init_simple_reduc * 2 >= datarefs.length ())
>
>> ?  That is, why is the constant init so special?  Even for non-constant init
>> we're changing two outer loop DRs to two non-consecutive inner loop DRs.
> No, the two outer loop DRs becomes consecutive with respect to inner loop.
> So for a typical matrix mul case, the interchange moves two outer loop
> DRs into inner loops, moves an inner loop DR out to outer loop.
> Overall it introduces an additional instruction.  In terms of cache
> behavior, it's even cheaper?  Because the two new DRs access the same
> memory object.
> As for pr62178.c, assembly regressed from:
> .L3:
> ldr w3, [x0], 124
> ldr w4, [x2, 4]!
> cmp x0, x5
> maddw1, w4, w3, w1
> bne .L3
>
> To:
> .L3:
> ldr w2, [x3, x0, lsl 2]
> cmp w5, 1
> ldr w1, [x4, x0, lsl 2]
> cselw2, w2, wzr, ne
> maddw1, w6, w1, w2
> str w1, [x3, x0, lsl 2]
> add x0, x0, 1
> cmp   

Re: [PATCH] Fix 2 typos

2017-12-08 Thread Martin Jambor
Hi,

On Fri, Dec 08 2017, Jakub Jelinek wrote:
> Hi!
>
> There is no get_base_ref_and_offset function and AFAIK never has been,
> my guess is these comments meant get_ref_base_and_extent.

I am quite sure that is the case.

> Ok for trunk?

Well, my opinion is that it  obviously is.

Thanks,

Martin


>
> 2017-12-08  Jakub Jelinek  
>
>   * ipa-polymorphic-call.c (noncall_stmt_may_be_vtbl_ptr_store): Fix
>   a comment typo, get_base_ref_and_offset -> get_ref_base_and_extent.
>   * ipa-prop.c (stmt_may_be_vtbl_ptr_store): Likewise.
>
> --- gcc/ipa-polymorphic-call.c.jj 2017-10-10 22:04:02.0 +0200
> +++ gcc/ipa-polymorphic-call.c2017-12-08 15:49:00.883187008 +0100
> @@ -1149,7 +1149,7 @@ noncall_stmt_may_be_vtbl_ptr_store (gimp
> if (TREE_CODE (lhs) == COMPONENT_REF
> && !DECL_VIRTUAL_P (TREE_OPERAND (lhs, 1)))
>   return false;
> -   /* In the future we might want to use get_base_ref_and_offset to find
> +   /* In the future we might want to use get_ref_base_and_extent to find
>if there is a field corresponding to the offset and if so, proceed
>almost like if it was a component ref.  */
>   }
> --- gcc/ipa-prop.c.jj 2017-11-28 22:19:30.0 +0100
> +++ gcc/ipa-prop.c2017-12-08 15:48:43.848397515 +0100
> @@ -652,7 +652,7 @@ stmt_may_be_vtbl_ptr_store (gimple *stmt
> if (TREE_CODE (lhs) == COMPONENT_REF
> && !DECL_VIRTUAL_P (TREE_OPERAND (lhs, 1)))
>   return false;
> -   /* In the future we might want to use get_base_ref_and_offset to find
> +   /* In the future we might want to use get_ref_base_and_extent to find
>if there is a field corresponding to the offset and if so, proceed
>almost like if it was a component ref.  */
>   }
>
>   Jakub


Fix PR83323 (crafty miscompile by unroll-and-jam)

2017-12-08 Thread Michael Matz
Hi,

the predicate for feasiblity of unroll-and-jam was a bit nonsensical.  
Properly check for non-head-controlled loops, and also check all BBs of 
the outer loop for any instructions that would inhibit fusion (before we 
never checked header or latch).

Richi also asked me to reuse the -floop-unrool-and-jam option (currently 
aliased to graphites loop-nest-optimize, like a couple other old options), 
instead of adding a new one with slightly different spelling.  This patch 
does that in addition.

Fixes the added testcase and crafty from cpu2000.  Regstrapping on 
x86_64-linux in progress.  Okay if that passes?


Ciao,
Michael.
---
Fix PR83323

* gimple-loop-jam (unroll_jam_possible_p): Correct test for
head-controlled loops and loop BBs.
* common.opt (funroll-and-jam): Remove, instead ...
(floop-unroll-and-jam): ... reuse this option.
* opts.c (default_options_table): Use OPT_floop_unroll_and_jam.
* doc/invoke.texi (-funroll-and-jam): Move docu to ...
(-floop-unroll-and-jam): ... this option.

testsuite/
* gcc.dg/pr83323.c: New test.

diff --git a/gcc/common.opt b/gcc/common.opt
index 6fab2ab..57b3cd7 100644
--- a/gcc/common.opt
+++ b/gcc/common.opt
@@ -1512,8 +1512,8 @@ Common Alias(floop-nest-optimize)
 Enable loop nest transforms.  Same as -floop-nest-optimize.
 
 floop-unroll-and-jam
-Common Alias(floop-nest-optimize)
-Enable loop nest transforms.  Same as -floop-nest-optimize.
+Common Report Var(flag_unroll_jam) Optimization
+Perform unroll-and-jam on loops.
 
 fgnu-tm
 Common Report Var(flag_tm)
@@ -2695,10 +2695,6 @@ fsplit-loops
 Common Report Var(flag_split_loops) Optimization
 Perform loop splitting.
 
-funroll-and-jam
-Common Report Var(flag_unroll_jam) Optimization
-Perform unroll-and-jam on loops.
-
 funwind-tables
 Common Report Var(flag_unwind_tables) Optimization
 Just generate unwind tables for exception handling.
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 50740c5..1413095 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -437,7 +437,7 @@ Objective-C and Objective-C++ Dialects}.
 -ftree-reassoc  -ftree-sink  -ftree-slsr  -ftree-sra @gol
 -ftree-switch-conversion  -ftree-tail-merge @gol
 -ftree-ter  -ftree-vectorize  -ftree-vrp  -funconstrained-commons @gol
--funit-at-a-time  -funroll-all-loops  -funroll-loops -funroll-and-jam @gol
+-funit-at-a-time  -funroll-all-loops  -funroll-loops @gol
 -funsafe-math-optimizations  -funswitch-loops @gol
 -fipa-ra  -fvariable-expansion-in-unroller  -fvect-cost-model  -fvpt @gol
 -fweb  -fwhole-program  -fwpa  -fuse-linker-plugin @gol
@@ -8511,11 +8511,9 @@ at @option{-O} and higher.
 @item -ftree-loop-linear
 @itemx -floop-strip-mine
 @itemx -floop-block
-@itemx -floop-unroll-and-jam
 @opindex ftree-loop-linear
 @opindex floop-strip-mine
 @opindex floop-block
-@opindex floop-unroll-and-jam
 Perform loop nest optimizations.  Same as
 @option{-floop-nest-optimize}.  To use this code transformation, GCC has
 to be configured with @option{--with-isl} to enable the Graphite loop
@@ -9789,8 +9787,8 @@ for one side of the iteration space and false for the 
other.
 Move branches with loop invariant conditions out of the loop, with duplicates
 of the loop on both branches (modified according to result of the condition).
 
-@item -funroll-and-jam
-@opindex funroll-and-jam
+@item -floop-unroll-and-jam
+@opindex floop-unroll-and-jam
 Apply unroll and jam transoformations on feasible loops.  In a loop
 nest this unrolls the outer loop by some factor and fuses the resulting
 multiple inner loops.
diff --git a/gcc/gimple-loop-jam.c b/gcc/gimple-loop-jam.c
index 32f813b..8ed1bef 100644
--- a/gcc/gimple-loop-jam.c
+++ b/gcc/gimple-loop-jam.c
@@ -152,7 +152,7 @@ merge_loop_tree (struct loop *loop, struct loop *old)
   free (bbs);
 }
 
-/* BB exits the outer loop of an unroll-and-jam situation.
+/* BB is part of the outer loop of an unroll-and-jam situation.
Check if any statements therein would prevent the transformation.  */
 
 static bool
@@ -160,9 +160,10 @@ bb_prevents_fusion_p (basic_block bb)
 {
   gimple_stmt_iterator gsi;
   /* BB is duplicated by outer unrolling and then all N-1 first copies
- move into the body of the fused inner loop.  The last copy remains
- the exit block of the outer loop and is still outside the inner loop
- also after fusion.  We can't allow this for some effects of BB:
+ move into the body of the fused inner loop.  If BB exits the outer loop
+ the last copy still doess so, and the first N-1 copies are cancelled
+ by loop unrolling, so also after fusion it's the exit block.
+ But there might be other reasons that prevent fusion:
* stores or unknown side-effects prevent fusion
* loads don't
* computations into SSA names: these aren't problematic.  Their
@@ -204,6 +205,19 @@ unroll_jam_possible_p (struct loop *outer, struct loop 
*loop)
   if (outer->

Re: [PATCH][i386] Fix PR83008

2017-12-08 Thread Jan Hubicka
> On December 8, 2017 4:51:16 PM GMT+01:00, Jan Hubicka  wrote:
> >> 
> >> This restores the vec_construct cost dependence on the vector element
> >> count.  Honza removed this (accidentially?) during the rework.
> >> 
> >> Bootstrap / regtest running on x86_64-unknown-linux-gnu, ok for
> >trunk?
> >
> >Hmm, the false parameter to ix86_vec_cost is supposed to do that.  It
> >uses:
> >
> >if (!parallel) 
> >  
> >return cost * GET_MODE_NUNITS (mode);  
> >
> >
> >Why it doesn't work?
> 
> I see. Not exactly the same. I guess we need to see why the costs favor a 16 
> element vector in it plus store over 16 scalar stores then... 

Isn't it what you proposed as profitable for Martin Jambor's copy by pieces
issue?  I think the reason is that stores are modeled as having latency (or
cost/2) of 4.  Construction is modelled as simple sse op (latency 1)*number of
parts.

So in this simplified model we cummulate latency of 16 stores as 16*4,
while construction plus one store as 16*1+4.

Honza
> 
> See the PR. 
> 
> >
> >Honza
> >> 
> >> Thanks,
> >> Richard.
> >> 
> >> 2017-12-08  Richard Biener  
> >> 
> >>PR target/83008
> >>* config/i386/i386.c (ix86_builtin_vectorization_cost): Restore
> >>vec_construct dependence on vector element count.
> >> 
> >> Index: gcc/config/i386/i386.c
> >> ===
> >> --- gcc/config/i386/i386.c (revision 255499)
> >> +++ gcc/config/i386/i386.c (working copy)
> >> @@ -44879,7 +44879,8 @@ ix86_builtin_vectorization_cost (enum ve
> >>  ix86_cost->sse_op, true);
> >>  
> >>case vec_construct:
> >> -  return ix86_vec_cost (mode, ix86_cost->sse_op, false);
> >> +  return (ix86_vec_cost (mode, ix86_cost->sse_op, false)
> >> +  * (TYPE_VECTOR_SUBPARTS (vectype) - 1));
> >>  
> >>default:
> >>  gcc_unreachable ();


Re: Fix PR83323 (crafty miscompile by unroll-and-jam)

2017-12-08 Thread Richard Biener
On December 8, 2017 5:44:31 PM GMT+01:00, Michael Matz  wrote:
>Hi,
>
>the predicate for feasiblity of unroll-and-jam was a bit nonsensical.  
>Properly check for non-head-controlled loops, and also check all BBs of
>
>the outer loop for any instructions that would inhibit fusion (before
>we 
>never checked header or latch).
>
>Richi also asked me to reuse the -floop-unrool-and-jam option
>(currently 
>aliased to graphites loop-nest-optimize, like a couple other old
>options), 
>instead of adding a new one with slightly different spelling.  This
>patch 
>does that in addition.
>
>Fixes the added testcase and crafty from cpu2000.  Regstrapping on 
>x86_64-linux in progress.  Okay if that passes?

OK. 

Don't you need to adjust existing unroll and jam testcases? 

Thanks, 
Richard. 

>
>Ciao,
>Michael.
>---
>Fix PR83323
>
>   * gimple-loop-jam (unroll_jam_possible_p): Correct test for
>   head-controlled loops and loop BBs.
>   * common.opt (funroll-and-jam): Remove, instead ...
>   (floop-unroll-and-jam): ... reuse this option.
>   * opts.c (default_options_table): Use OPT_floop_unroll_and_jam.
>   * doc/invoke.texi (-funroll-and-jam): Move docu to ...
>   (-floop-unroll-and-jam): ... this option.
>
>testsuite/
>   * gcc.dg/pr83323.c: New test.
>
>diff --git a/gcc/common.opt b/gcc/common.opt
>index 6fab2ab..57b3cd7 100644
>--- a/gcc/common.opt
>+++ b/gcc/common.opt
>@@ -1512,8 +1512,8 @@ Common Alias(floop-nest-optimize)
> Enable loop nest transforms.  Same as -floop-nest-optimize.
> 
> floop-unroll-and-jam
>-Common Alias(floop-nest-optimize)
>-Enable loop nest transforms.  Same as -floop-nest-optimize.
>+Common Report Var(flag_unroll_jam) Optimization
>+Perform unroll-and-jam on loops.
> 
> fgnu-tm
> Common Report Var(flag_tm)
>@@ -2695,10 +2695,6 @@ fsplit-loops
> Common Report Var(flag_split_loops) Optimization
> Perform loop splitting.
> 
>-funroll-and-jam
>-Common Report Var(flag_unroll_jam) Optimization
>-Perform unroll-and-jam on loops.
>-
> funwind-tables
> Common Report Var(flag_unwind_tables) Optimization
> Just generate unwind tables for exception handling.
>diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
>index 50740c5..1413095 100644
>--- a/gcc/doc/invoke.texi
>+++ b/gcc/doc/invoke.texi
>@@ -437,7 +437,7 @@ Objective-C and Objective-C++ Dialects}.
> -ftree-reassoc  -ftree-sink  -ftree-slsr  -ftree-sra @gol
> -ftree-switch-conversion  -ftree-tail-merge @gol
> -ftree-ter  -ftree-vectorize  -ftree-vrp  -funconstrained-commons @gol
>--funit-at-a-time  -funroll-all-loops  -funroll-loops -funroll-and-jam
>@gol
>+-funit-at-a-time  -funroll-all-loops  -funroll-loops @gol
> -funsafe-math-optimizations  -funswitch-loops @gol
>-fipa-ra  -fvariable-expansion-in-unroller  -fvect-cost-model  -fvpt
>@gol
> -fweb  -fwhole-program  -fwpa  -fuse-linker-plugin @gol
>@@ -8511,11 +8511,9 @@ at @option{-O} and higher.
> @item -ftree-loop-linear
> @itemx -floop-strip-mine
> @itemx -floop-block
>-@itemx -floop-unroll-and-jam
> @opindex ftree-loop-linear
> @opindex floop-strip-mine
> @opindex floop-block
>-@opindex floop-unroll-and-jam
> Perform loop nest optimizations.  Same as
>@option{-floop-nest-optimize}.  To use this code transformation, GCC
>has
> to be configured with @option{--with-isl} to enable the Graphite loop
>@@ -9789,8 +9787,8 @@ for one side of the iteration space and false for
>the other.
>Move branches with loop invariant conditions out of the loop, with
>duplicates
>of the loop on both branches (modified according to result of the
>condition).
> 
>-@item -funroll-and-jam
>-@opindex funroll-and-jam
>+@item -floop-unroll-and-jam
>+@opindex floop-unroll-and-jam
> Apply unroll and jam transoformations on feasible loops.  In a loop
>nest this unrolls the outer loop by some factor and fuses the resulting
> multiple inner loops.
>diff --git a/gcc/gimple-loop-jam.c b/gcc/gimple-loop-jam.c
>index 32f813b..8ed1bef 100644
>--- a/gcc/gimple-loop-jam.c
>+++ b/gcc/gimple-loop-jam.c
>@@ -152,7 +152,7 @@ merge_loop_tree (struct loop *loop, struct loop
>*old)
>   free (bbs);
> }
> 
>-/* BB exits the outer loop of an unroll-and-jam situation.
>+/* BB is part of the outer loop of an unroll-and-jam situation.
>  Check if any statements therein would prevent the transformation.  */
> 
> static bool
>@@ -160,9 +160,10 @@ bb_prevents_fusion_p (basic_block bb)
> {
>   gimple_stmt_iterator gsi;
>   /* BB is duplicated by outer unrolling and then all N-1 first copies
>- move into the body of the fused inner loop.  The last copy
>remains
>- the exit block of the outer loop and is still outside the inner
>loop
>- also after fusion.  We can't allow this for some effects of BB:
>+ move into the body of the fused inner loop.  If BB exits the
>outer loop
>+ the last copy still doess so, and the first N-1 copies are
>cancelled
>+ by loop unrolling, so also after fusion it's the exit block.
>+ But there might be other reasons t

Re: [PATCH][i386] Fix PR83008

2017-12-08 Thread Richard Biener
On December 8, 2017 5:45:23 PM GMT+01:00, Jan Hubicka  wrote:
>> On December 8, 2017 4:51:16 PM GMT+01:00, Jan Hubicka
> wrote:
>> >> 
>> >> This restores the vec_construct cost dependence on the vector
>element
>> >> count.  Honza removed this (accidentially?) during the rework.
>> >> 
>> >> Bootstrap / regtest running on x86_64-unknown-linux-gnu, ok for
>> >trunk?
>> >
>> >Hmm, the false parameter to ix86_vec_cost is supposed to do that. 
>It
>> >uses:
>> >
>> >if (!parallel)  
>  
>> >  
>> >return cost * GET_MODE_NUNITS (mode);   
>  
>> >
>> >
>> >Why it doesn't work?
>> 
>> I see. Not exactly the same. I guess we need to see why the costs
>favor a 16 element vector in it plus store over 16 scalar stores
>then... 
>
>Isn't it what you proposed as profitable for Martin Jambor's copy by
>pieces
>issue?  I think the reason is that stores are modeled as having latency
>(or
>cost/2) of 4.  Construction is modelled as simple sse op (latency
>1)*number of
>parts.

Yes. 

>So in this simplified model we cummulate latency of 16 stores as 16*4,
>while construction plus one store as 16*1+4.

But vector vs. Scalar cost somehow mess up for avx512... 

>Honza
>> 
>> See the PR. 
>> 
>> >
>> >Honza
>> >> 
>> >> Thanks,
>> >> Richard.
>> >> 
>> >> 2017-12-08  Richard Biener  
>> >> 
>> >>   PR target/83008
>> >>   * config/i386/i386.c (ix86_builtin_vectorization_cost): Restore
>> >>   vec_construct dependence on vector element count.
>> >> 
>> >> Index: gcc/config/i386/i386.c
>> >>
>===
>> >> --- gcc/config/i386/i386.c(revision 255499)
>> >> +++ gcc/config/i386/i386.c(working copy)
>> >> @@ -44879,7 +44879,8 @@ ix86_builtin_vectorization_cost (enum ve
>> >> ix86_cost->sse_op, true);
>> >>  
>> >>case vec_construct:
>> >> - return ix86_vec_cost (mode, ix86_cost->sse_op, false);
>> >> + return (ix86_vec_cost (mode, ix86_cost->sse_op, false)
>> >> + * (TYPE_VECTOR_SUBPARTS (vectype) - 1));
>> >>  
>> >>default:
>> >>  gcc_unreachable ();



Re: [PATCH] Fix 2 typos

2017-12-08 Thread Jeff Law
On 12/08/2017 09:20 AM, Jakub Jelinek wrote:
> Hi!
> 
> There is no get_base_ref_and_offset function and AFAIK never has been,
> my guess is these comments meant get_ref_base_and_extent.
> Ok for trunk?
> 
> 2017-12-08  Jakub Jelinek  
> 
>   * ipa-polymorphic-call.c (noncall_stmt_may_be_vtbl_ptr_store): Fix
>   a comment typo, get_base_ref_and_offset -> get_ref_base_and_extent.
>   * ipa-prop.c (stmt_may_be_vtbl_ptr_store): Likewise.
OK.
jeff


[PATCH, rs6000] (v2) Gimple folding of splat_uX

2017-12-08 Thread Will Schmidt

Hi,
Add support for gimple folding of splat_u{8,16,32}.
Testcase coverage is primarily handled by existing tests
testsuite/gcc.target/powerpc/fold-vec-splat_*.c

One new test added to verify we continue to receive
an 'invalid argument, must be a 5-bit immediate' error
when we try to splat a non-constant value.

V2 updates include..
  Use the gimple_convert() helper.
  Use the build_vector_from_val() helper.
  whitespace fix-ups.
Those changes actually simplify the code here significantly, which is good. :-)

Per comments from last iteration, regarding the early exit for
arg0 != INTEGER_CST, if I remove the early exit when arg0 is not constant,
the code appears to handle things OK, and the generated code looks
reasonable, but this is a behavior change with respect to when folding is
disabled, which is to error out with an ("argument 1 must be a 5-bit signed
literal") error.  For that reason, I'm planning to leave that check in place.
I'll defer to Bill/Segher/David, etc on whether it would be appropriate to
change that behavior.

This intrinsic only takes INTegral types, so no need to check for FLOAT_CST.
(The other vec_splat* intrinsics will deal more with floats, those are later
in my queue).

Regtests across assorted power systems look clean.
OK for trunk?

Thanks,
-Will

[gcc]

2017-12-08  Will Schmidt  

* config/rs6000/rs6000.c (rs6000_gimple_fold_builtin): Add support for
early folding of splat_u{8,16,32}.

[testsuite]

2017-12-08  Will Schmidt  

* gcc.target/powerpc/fold-vec-splat-misc-invalid.c: New.

diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
index 045a014..8972d80 100644
--- a/gcc/config/rs6000/rs6000.c
+++ b/gcc/config/rs6000/rs6000.c
@@ -16614,10 +16614,32 @@ rs6000_gimple_fold_builtin (gimple_stmt_iterator *gsi)
 case VSX_BUILTIN_CMPLE_2DI:
 case VSX_BUILTIN_CMPLE_U2DI:
   fold_compare_helper (gsi, LE_EXPR, stmt);
   return true;
 
+/* flavors of vec_splat_[us]{8,16,32}.  */
+case ALTIVEC_BUILTIN_VSPLTISB:
+case ALTIVEC_BUILTIN_VSPLTISH:
+case ALTIVEC_BUILTIN_VSPLTISW:
+  {
+arg0 = gimple_call_arg (stmt, 0);
+lhs = gimple_call_lhs (stmt);
+/* Only fold the vec_splat_*() if arg0 is constant.  */
+if (TREE_CODE (arg0) != INTEGER_CST)
+  return false;
+gimple_seq stmts = NULL;
+location_t loc = gimple_location (stmt);
+tree splat_value = gimple_convert (&stmts, loc,
+   TREE_TYPE (TREE_TYPE (lhs)), arg0);
+gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
+tree splat_tree = build_vector_from_val (TREE_TYPE (lhs), splat_value);
+g = gimple_build_assign (lhs, splat_tree);
+gimple_set_location (g, gimple_location (stmt));
+gsi_replace (gsi, g, true);
+return true;
+  }
+
 default:
   if (TARGET_DEBUG_BUILTIN)
fprintf (stderr, "gimple builtin intrinsic not matched:%d %s %s\n",
 fn_code, fn_name1, fn_name2);
   break;
diff --git a/gcc/testsuite/gcc.target/powerpc/fold-vec-splat-misc-invalid.c 
b/gcc/testsuite/gcc.target/powerpc/fold-vec-splat-misc-invalid.c
new file mode 100644
index 000..20f5b05
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/fold-vec-splat-misc-invalid.c
@@ -0,0 +1,33 @@
+/* Verify that overloaded built-ins for vec_splat_s8 and vec_splat_s16
+   generate errors as expected when we attempt to use invalid inputs.  */
+
+/* { dg-do compile } */
+/* { dg-require-effective-target powerpc_vsx_ok } */
+/* { dg-options "-mvsx -O2" } */
+
+#include 
+
+vector signed short
+testss_1 (unsigned int ui)
+{
+  return vec_splat_s16 (ui);/* { dg-error "argument 1 must be a 5-bit signed 
literal" } */
+}
+
+vector unsigned short
+testss_2 (signed int si)
+{
+  return vec_splat_u16 (si);/* { dg-error "argument 1 must be a 5-bit signed 
literal" } */
+}
+
+vector signed char
+testsc_1 (unsigned int ui)
+{
+  return vec_splat_s8 (ui); /* { dg-error "argument 1 must be a 5-bit signed 
literal" } */
+}
+
+vector unsigned char
+testsc_2 (signed int si)
+{
+  return vec_splat_u8 (si);/* { dg-error "argument 1 must be a 5-bit signed 
literal" } */
+}
+




Re: [C++ PATCH] Harmonize C++ flexible array member initialization with C (PR c++/80135, PR c++/81922)

2017-12-08 Thread Martin Sebor

On 12/08/2017 09:15 AM, Jakub Jelinek wrote:

Hi!

Martin's patch a few years ago started allowing flexible array members
inside of nested aggregates, similarly to what we were doing in C.
But C rejects cases where we in nested context try to initialize a flexible
array member with a non-empty initializer, because that is something that
can't really work. Say if a flexible array member is inside of a struct
and we are initializing an array of such structs, we can't really have
each array element with a different width based on how large was the
initializer for a particular element's flexible array member.
After Martin's change, we were accepting those and silently generating bogus
assembly (claiming some size of elements but the initializer really followed
the sizes of what was added there), then I think Nathan added some
verification and since then we usually just ICE on those.

This patch does the similar thing in the C++ FE to what the C FE does, i.e.
allow empty initializers of flexible array members ( {}, not "" as that is
already non-zero size) everywhere, and for others allow them only for the
outermost struct/class/union.
Allowing the empty flexible array members is IMHO useful, people can have
say some general structure that is sometimes used as toplevel object and
can be initialized with arbitrarily sized array, and sometimes just use it
inside other structs or arrays if the array isn't needed.


Harmonizing C++ with C was one of my goals when I made those
changes so rejecting the nested initialization makes sense.
I suspect I just missed that C errors out on these and assumed
it allowed them like it does at the top level (rather than
deliberately relaxing the rules), so thank you for tightening
it up.

While testing the patch I noticed it issues some diagnostics
multiple times.  It would be nice if the last (redundant)
pedantic warning in the error case below could be avoided.

struct B { int i; struct { int j, a[]; } a; };

struct B g (void)
{
  return (struct B){ 1, { 2, { 3 } } };
}

u.C:1:37: warning: ISO C++ forbids flexible array member ‘a’ [-Wpedantic]
 struct B { int i; struct { int j, a[]; } a; };
 ^
u.C:1:42: warning: invalid use of ‘struct B::’ with a flexible 
array member in ‘struct B’ [-Wpedantic]

 struct B { int i; struct { int j, a[]; } a; };
  ^
u.C:1:37: note: array member ‘int Ba []’ declared here
 struct B { int i; struct { int j, a[]; } a; };
 ^
u.C: In function ‘B g()’:
u.C:5:38: warning: ISO C++ forbids compound-literals [-Wpedantic]
   return (struct B){ 1, { 2, { 3 } } };
  ^
u.C:5:38: warning: initialization of a flexible array member [-Wpedantic]
u.C:5:38: error: initialization of flexible array member in a nested context

Martin



Re: [C++ PATCH] Harmonize C++ flexible array member initialization with C (PR c++/80135, PR c++/81922)

2017-12-08 Thread Jakub Jelinek
On Fri, Dec 08, 2017 at 10:25:48AM -0700, Martin Sebor wrote:
> While testing the patch I noticed it issues some diagnostics
> multiple times.  It would be nice if the last (redundant)
> pedantic warning in the error case below could be avoided.

You mean completely drop the pedwarn "initialization of a flexible array member"
in all cases, because in theory we should have emitted already
"ISO C++ forbids flexible array member" pedwarn earlier?
I think it is better to pedwarn in both spots, the earlier pedwarn
might be disabled because of system headers or whatever else.

Or do you just mean to pedwarn only if we are not emitting the
"initialization of flexible array member in a nested context" error?
That is going to be harder, as that error is emitted in a different spot
(actually 2, because the STRING_CST vs. brace enclosed initializer is
separate), unlike the pedwarn is complain & tf_error guarded, etc.

Jakub


Re: [C++ PATCH] Harmonize C++ flexible array member initialization with C (PR c++/80135, PR c++/81922)

2017-12-08 Thread Martin Sebor

On 12/08/2017 10:34 AM, Jakub Jelinek wrote:

On Fri, Dec 08, 2017 at 10:25:48AM -0700, Martin Sebor wrote:

While testing the patch I noticed it issues some diagnostics
multiple times.  It would be nice if the last (redundant)
pedantic warning in the error case below could be avoided.


You mean completely drop the pedwarn "initialization of a flexible array member"
in all cases, because in theory we should have emitted already
"ISO C++ forbids flexible array member" pedwarn earlier?
I think it is better to pedwarn in both spots, the earlier pedwarn
might be disabled because of system headers or whatever else.


I agree.



Or do you just mean to pedwarn only if we are not emitting the
"initialization of flexible array member in a nested context" error?
That is going to be harder, as that error is emitted in a different spot
(actually 2, because the STRING_CST vs. brace enclosed initializer is
separate), unlike the pedwarn is complain & tf_error guarded, etc.


I meant the latter.  It's no big deal if it's too hard.  It's
just something I noticed and thought might be easy to handle.

Martin



Re: Fix PR83323 (crafty miscompile by unroll-and-jam)

2017-12-08 Thread Michael Matz
Hi,

On Fri, 8 Dec 2017, Richard Biener wrote:

> OK. 
> Don't you need to adjust existing unroll and jam testcases? 

Ahem... that's at least what the regression testing told me as well :)  
Fixed in what I checked in (the only testcase we had was the one I added 
yesterday)


Ciao,
Michael.



[PATCH PR83320]Fix new/free mismatch issue

2017-12-08 Thread Bin Cheng
Hi,
While I am still trying to reproduce and verify the issue (valgrind checking 
runs very slow for me),
It's clear I made stupid mistake using free for newed vector.  This simple 
patch fixes it.
Bootstrap and test ongoing.  Is it OK?

Thanks,
bin
2017-12-06  Bin Cheng  

PR tree-optimization/83320
* gimple-loop-interchange.cc (free_data_refs_with_aux): Use delete.
(prune_datarefs_not_in_loop): Ditto.diff --git a/gcc/gimple-loop-interchange.cc b/gcc/gimple-loop-interchange.cc
index 3f7c54f..92c96a3 100644
--- a/gcc/gimple-loop-interchange.cc
+++ b/gcc/gimple-loop-interchange.cc
@@ -945,7 +945,7 @@ free_data_refs_with_aux (vec datarefs)
 if (dr->aux != NULL)
   {
DR_ACCESS_STRIDE (dr)->release ();
-   free (dr->aux);
+   delete (vec *) dr->aux;
   }
 
   free_data_refs (datarefs);
@@ -1843,7 +1843,7 @@ prune_datarefs_not_in_loop (struct loop *loop, 
vec datarefs)
  if (dr->aux)
{
  DR_ACCESS_STRIDE (dr)->release ();
- free (dr->aux);
+ delete (vec *) dr->aux;
}
  free_data_ref (dr);
}


Re: [PATCH] Fix result for conditional reductions matching at index 0

2017-12-08 Thread Jakub Jelinek
Hi!

What is going on with this patch, will anybody commit it?

This is also http://gcc.gnu.org/PR80631 apparently.

The patch doesn't apply cleanly to current trunk due to the
reduc_code -> reduc_fn changes.

As it doesn't apply, I can't easily check what the patch generates
on the PR80631 testcases vs. my thoughts on that; though, if it emits
something more complicated for the simple cases, perhaps we could improve
those (not handle it like COND_REDUCTION, but instead as former
INTEGER_INDUC_COND_REDUCTION and just use a different constant instead of 0
if 0 isn't usable for the condition never matched value.  We could
check the value from the loop preheader edge of the induc_stmt phi
and if it is constant and for REDUC_MAX (does that imply always positive
step?) larger than minimum value of the type, we can use the minimum of
the type as the value instead of zero.  If REDUC_MIN and the initial constant
is smaller than the maximum value of the type, we could use the maximum.
Otherwise punt to what you're doing.  Or instead of minimum or maximum check
the value of initial_def, if it is constant, if that value is usable, we
could even avoid the final COND_EXPR.

Another thing is that is_nonwrapping_integer_induction has:
  /* Make sure the loop is integer based.  */
  if (TREE_CODE (base) != INTEGER_CST
  || TREE_CODE (step) != INTEGER_CST)
return false;

  /* Check that the induction increments.  */
  if (tree_int_cst_sgn (step) == -1)
return false;

  if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
return true;

First of all, I fail to see why we don't handle negative step,
that can be done with REDUC_MIN instead of REDUC_MAX.

And, I also fail to see why we need to require INTEGER_CST base if
TYPE_OVERFLOW_UNDEFINED (lhs_type), in that case we know it will not wrap,
so all we care about is if step * ni won't cover all iterations and we can
use some value (type minimum or maximum) to denote the condition never
true case.

On Wed, Nov 22, 2017 at 05:57:08PM +0100, Kilian Verhetsel wrote:
> 2017-11-21  Kilian Verhetsel 

Missing space before <.

> gcc/ChangeLog:
>   PR testsuite/81179
>   * tree-vect-loop.c
> (vect_create_epilog_for_reduction): Fix the returned value for

8 spaces instead of a tab, note the (vect_create_epilog_for_reduction):
should just go after space right after the filename.

>   INTEGER_INDUC_COND_REDUCTION whose last match occurred at
>   index 0.
>   (vectorizable_reduction): For INTEGER_INDUC_COND_REDUCTION,
>   pass the PHI statement that sets the induction variable to the
>   code generating the epilogue and check that no overflow will
>   occur.
> 
> gcc/testsuite/ChangeLog:
>   * gcc.dg/vect/vect-iv-cond-reduc-overflow-1.c: New test for
> overflows when compiling conditional reductions.
> * gcc.dg/vect/vect-iv-cond-reduc-overflow-2.c: Likewise.

Again, spaces instead of tab twice.

> -  if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
> +  if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
> +  || STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
> +  == INTEGER_INDUC_COND_REDUCTION)

Formatting, == should be below STMT_VINFO on the previous line, for emacs
users perhaps even better surrounded by ()s, like:


  if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
  || (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
  == INTEGER_INDUC_COND_REDUCTION))

(happens many times in the patch).

Jakub


[patch, fortran] Fix PR 83316

2017-12-08 Thread Thomas Koenig

Hello world,

the attached patch fixes a case where simplification wasn't done
correctly for maxval and minval with character variables.

Regression-tested. OK for trunk?

Regards

Thomas

2017-12-08  Thomas Koenig  

PR fortran/83316
* arith.c (gfc_character2character): New function.
* arith.h: Add prototype.
* simplify.c (gfc_convert_constant): Handle character type.

2017-12-08  Thomas Koenig  

PR fortran/83316
* gfortran.dg/minval_char_5.f90: New test.

Index: arith.c
===
--- arith.c	(Revision 255181)
+++ arith.c	(Arbeitskopie)
@@ -2514,7 +2514,19 @@ gfc_int2log (gfc_expr *src, int kind)
   return result;
 }
 
+/* Convert character to character. We only use wide strings internally,
+   so we only set the kind.  */
 
+gfc_expr *
+gfc_character2character (gfc_expr *src, int kind)
+{
+  gfc_expr *result;
+  result = gfc_copy_expr (src);
+  result->ts.kind = kind;
+
+  return result;
+}
+
 /* Helper function to set the representation in a Hollerith conversion.  
This assumes that the ts.type and ts.kind of the result have already
been set.  */
Index: arith.h
===
--- arith.h	(Revision 255181)
+++ arith.h	(Arbeitskopie)
@@ -82,6 +82,7 @@ gfc_expr *gfc_hollerith2real (gfc_expr *, int);
 gfc_expr *gfc_hollerith2complex (gfc_expr *, int);
 gfc_expr *gfc_hollerith2character (gfc_expr *, int);
 gfc_expr *gfc_hollerith2logical (gfc_expr *, int);
+gfc_expr *gfc_character2character (gfc_expr *, int);
 
 #endif /* GFC_ARITH_H  */
 
Index: simplify.c
===
--- simplify.c	(Revision 255181)
+++ simplify.c	(Arbeitskopie)
@@ -7130,6 +7130,13 @@ gfc_convert_constant (gfc_expr *e, bt type, int ki
 	}
   break;
 
+case BT_CHARACTER:
+  if (type == BT_CHARACTER)
+	f = gfc_character2character;
+  else
+	goto oops;
+  break;
+
 default:
 oops:
   gfc_internal_error ("gfc_convert_constant(): Unexpected type");
! { dg-do  run }
! PR fortran/83316 - this used to ICE
program tminmaxval
  implicit none

  character(len=*), parameter :: b = "a"
  character(len=*), parameter :: e = "c"
  character(len=*), parameter :: s(3) = (/"a", "b", "c"/)

  if (minval(s) /= b) then
call abort
  end if
  
  if (maxval(s) /= e) then
call abort
  end if

end program tminmaxval


plugin-api.h patch to add a new interface for linker plugins

2017-12-08 Thread Sriraman Tallam via gcc-patches
Hi,

   This patch was approved for binutils by Cary:
https://sourceware.org/ml/binutils/2017-12/msg00023.html

   Is it ok to apply this to GCC include/plugin-api.h ?

Thanks
Sri


Re: plugin-api.h patch to add a new interface for linker plugins

2017-12-08 Thread Sriraman Tallam via gcc-patches
Patch attached.

* plugin-api.h (ld_plugin_get_wrap_symbols): New
  plugin interface.

On Fri, Dec 8, 2017 at 11:01 AM, Sriraman Tallam  wrote:
> Hi,
>
>This patch was approved for binutils by Cary:
> https://sourceware.org/ml/binutils/2017-12/msg00023.html
>
>Is it ok to apply this to GCC include/plugin-api.h ?
>
> Thanks
> Sri
* plugin-api.h (ld_plugin_get_wrap_symbols): New
  plugin interface.

Index: include/plugin-api.h
===
--- include/plugin-api.h(revision 255515)
+++ include/plugin-api.h(working copy)
@@ -378,7 +378,15 @@
 enum ld_plugin_status
 (*ld_plugin_register_new_input) (ld_plugin_new_input_handler handler);
 
+/* The linker's interface for getting the list of wrapped symbols using the
+   --wrap option. This sets *NUM_SYMBOLS to number of wrapped symbols and
+   *WRAP_SYMBOL_LIST to the list of wrapped symbols. */
 
+typedef
+enum ld_plugin_status
+(*ld_plugin_get_wrap_symbols) (uint64_t *num_symbols,
+   const char ***wrap_symbol_list);
+
 enum ld_plugin_level
 {
   LDPL_INFO,
@@ -422,7 +430,8 @@
   LDPT_GET_SYMBOLS_V3 = 28,
   LDPT_GET_INPUT_SECTION_ALIGNMENT = 29,
   LDPT_GET_INPUT_SECTION_SIZE = 30,
-  LDPT_REGISTER_NEW_INPUT_HOOK = 31
+  LDPT_REGISTER_NEW_INPUT_HOOK = 31,
+  LDPT_GET_WRAP_SYMBOLS = 32
 };
 
 /* The plugin transfer vector.  */
@@ -457,6 +466,7 @@
 ld_plugin_get_input_section_alignment tv_get_input_section_alignment;
 ld_plugin_get_input_section_size tv_get_input_section_size;
 ld_plugin_register_new_input tv_register_new_input;
+ld_plugin_get_wrap_symbols tv_get_wrap_symbols;
   } tv_u;
 };
 


[committed] Add testcase for already fixed issue (PR rtl-optimization/81595)

2017-12-08 Thread Jakub Jelinek
Hi!

This bug has been already fixed by Eric in r254188.
Regtested on x86_64-linux -m64/-m32, verified it FAILs with the patch reverted
on both, committed to trunk as obvious.

2017-12-08  Jakub Jelinek  

PR rtl-optimization/81595
* gcc.c-torture/compile/pr81595.c: New test.

--- gcc/testsuite/gcc.c-torture/compile/pr81595.c.jj2017-12-08 
20:02:05.823795057 +0100
+++ gcc/testsuite/gcc.c-torture/compile/pr81595.c   2017-12-08 
20:00:05.0 +0100
@@ -0,0 +1,39 @@
+/* PR rtl-optimization/81595 */
+
+void
+foo (__INTPTR_TYPE__ *x, int *y, int *z, int u, int v)
+{
+  while (u != 0)
+{
+  if (*x != 0)
+   {
+ int a = 1;
+ l1:
+ if (*y != 0)
+   {
+ while (a < 2)
+   {
+ a = 0;
+ x = (__INTPTR_TYPE__ *)&x;
+ l2:
+ ++a;
+   }
+ while (*z != 0)
+   ;
+   }
+ a /= 0;
+   }
+  else
+   {
+ *z /= (*z != 0) ? 2 : 0;
+ while (v < 1)
+   {
+ *y = 0;
+ if (v != 0)
+   goto l1;
+ ++v;
+   }
+ goto l2;
+   }
+}
+}

Jakub


Re: [AArch64] Fix ICEs in aarch64_print_operand

2017-12-08 Thread Christophe Lyon
On 8 December 2017 at 17:05, Richard Sandiford
 wrote:
> Christophe Lyon  writes:
>> Hi Richard,
>> On 7 December 2017 at 10:31, James Greenhalgh  
>> wrote>> On Tue, Dec 05, 2017 at 05:57:37PM +, Richard Sandiford wrote:
 Three related regression fixes:

 - We can't use asserts like:

 gcc_assert (GET_MODE_SIZE (mode) == 16);

   in aarch64_print_operand because it could trigger for invalid user input.

 - The output_operand_lossage in aarch64_print_address_internal:

 output_operand_lossage ("invalid operand for '%%%c'", op);

   wasn't right because "op" is an rtx_code enum rather than the
   prefix character.

 - aarch64_print_operand_address shouldn't call output_operand_lossage
   (because it doesn't have a prefix code) but instead fall back to
   output_addr_const.

 Tested on aarch64-linux-gnu.  OK to install?
>>>
>>> OK.
>>>
>>> Thanks,
>>> James
>>>

 Thanks,
 Richard


 2017-12-05  Richard Sandiford  

 gcc/
   * config/aarch64/aarch64.c (aarch64_print_address_internal): Return
   a bool success value.  Don't call output_operand_lossage here.
   (aarch64_print_ldpstp_address): Return a bool success value.
   (aarch64_print_operand_address): Call output_addr_const if
   aarch64_print_address_internal fails.
   (aarch64_print_operand): Don't assert that the mode is 16 bytes for
   'y'; call output_operand_lossage instead.  Call 
 output_operand_lossage
   if aarch64_print_ldpstp_address fails.

 gcc/testsuite/
   * gcc.target/aarch64/asm-2.c: New test.
   * gcc.target/aarch64/asm-3.c: Likewise.

>>
>> The new test gcc.target/aarch64/asm-2.c ICEs when compiled with -mabi=ilp32:
>>
>> /gcc/testsuite/gcc.target/aarch64/asm-2.c: In function 'f':
>> /gcc/testsuite/gcc.target/aarch64/asm-2.c:10:1: internal compiler
>> error: in aarch64_print_address_internal, at
>> config/aarch64/aarch64.c:5636
>> 0xf2afd3 aarch64_print_address_internal
>> /gcc/config/aarch64/aarch64.c:5636
>> 0xf2affd aarch64_print_operand_address
>> /gcc/config/aarch64/aarch64.c:5733
>> 0x7fdd43 output_address(machine_mode, rtx_def*)
>> /gcc/final.c:3913
>> 0x801288 output_asm_insn(char const*, rtx_def**)
>> /gcc/final.c:3770
>> 0x802437 final_scan_insn(rtx_insn*, _IO_FILE*, int, int, int*)
>> /gcc/final.c:2673
>> 0x802a1a final(rtx_insn*, _IO_FILE*, int)
>> /gcc/final.c:2052
>> 0x8035ab rest_of_handle_final
>> /gcc/final.c:4498
>> 0x8035ab execute
>> /gcc/final.c:4572
>>
>> Can you check?
>
> I think that's a separate preexisting problem.  Could you file a PR?
>

Sure, I filed:
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=83335

> Personally I'd just remove the assert, but I'm guessing that wouldn't
> be acceptable...
>
> Thanks,
> Richard


Re: [PATCH PR83320]Fix new/free mismatch issue

2017-12-08 Thread Richard Biener
On December 8, 2017 6:49:21 PM GMT+01:00, Bin Cheng  wrote:
>Hi,
>While I am still trying to reproduce and verify the issue (valgrind
>checking runs very slow for me),
>It's clear I made stupid mistake using free for newed vector.  This
>simple patch fixes it.
>Bootstrap and test ongoing.  Is it OK?

OK. 

Richard. 

>Thanks,
>bin
>2017-12-06  Bin Cheng  
>
>   PR tree-optimization/83320
>   * gimple-loop-interchange.cc (free_data_refs_with_aux): Use delete.
>   (prune_datarefs_not_in_loop): Ditto.



Re: [patch, fortran] Fix PR 83316

2017-12-08 Thread Steve Kargl
On Fri, Dec 08, 2017 at 07:42:21PM +0100, Thomas Koenig wrote:
> 
> the attached patch fixes a case where simplification wasn't done
> correctly for maxval and minval with character variables.
> 
> Regression-tested. OK for trunk?
> 

OK.

-- 
Steve


[wwwdocs] Update location of Power ELFv2 ABI Specification

2017-12-08 Thread Bill Schmidt
Hi,

I noticed the location of the Power ELFv2 ABI document was out of date,
so I committed the following change.

Thanks,
Bill

Index: htdocs/readings.html
===
RCS file: /cvs/gcc/wwwdocs/htdocs/readings.html,v
retrieving revision 1.284
diff -r1.284 readings.html
249c249
<   https://members.openpowerfoundation.org/document/dl/576";>64-Bit ELF V2 
ABI - OpenPOWER ABI
---
>href="https://openpowerfoundation.org/?resource_lib=64-bit-elf-v2-abi-specification-power-architecture";>64-Bit
>  ELF V2 ABI - OpenPOWER ABI



Re: [PATCH, rs6000] (v2) Gimple folding of splat_uX

2017-12-08 Thread Bill Schmidt
On Dec 8, 2017, at 11:08 AM, Will Schmidt  wrote:
> 
> 
> Hi,
> Add support for gimple folding of splat_u{8,16,32}.
> Testcase coverage is primarily handled by existing tests
> testsuite/gcc.target/powerpc/fold-vec-splat_*.c
> 
> One new test added to verify we continue to receive
> an 'invalid argument, must be a 5-bit immediate' error
> when we try to splat a non-constant value.
> 
> V2 updates include..
>  Use the gimple_convert() helper.
>  Use the build_vector_from_val() helper.
>  whitespace fix-ups.
> Those changes actually simplify the code here significantly, which is good. 
> :-)
> 
> Per comments from last iteration, regarding the early exit for
> arg0 != INTEGER_CST, if I remove the early exit when arg0 is not constant,
> the code appears to handle things OK, and the generated code looks
> reasonable, but this is a behavior change with respect to when folding is
> disabled, which is to error out with an ("argument 1 must be a 5-bit signed
> literal") error.  For that reason, I'm planning to leave that check in place.
> I'll defer to Bill/Segher/David, etc on whether it would be appropriate to
> change that behavior.

RIght -- the long-documented behavior of these built-ins requires a constant
value.  Until such time as we change that documentation, if we feel it's a
good idea to do so, I would like to error out for non-constant values.

Thanks,
Bill

> 
> This intrinsic only takes INTegral types, so no need to check for FLOAT_CST.
> (The other vec_splat* intrinsics will deal more with floats, those are later
> in my queue).
> 
> Regtests across assorted power systems look clean.
> OK for trunk?
> 
> Thanks,
> -Will
> 
> [gcc]
> 
> 2017-12-08  Will Schmidt  
> 
>   * config/rs6000/rs6000.c (rs6000_gimple_fold_builtin): Add support for
>   early folding of splat_u{8,16,32}.
> 
> [testsuite]
> 
> 2017-12-08  Will Schmidt  
> 
>   * gcc.target/powerpc/fold-vec-splat-misc-invalid.c: New.
> 
> diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
> index 045a014..8972d80 100644
> --- a/gcc/config/rs6000/rs6000.c
> +++ b/gcc/config/rs6000/rs6000.c
> @@ -16614,10 +16614,32 @@ rs6000_gimple_fold_builtin (gimple_stmt_iterator 
> *gsi)
> case VSX_BUILTIN_CMPLE_2DI:
> case VSX_BUILTIN_CMPLE_U2DI:
>   fold_compare_helper (gsi, LE_EXPR, stmt);
>   return true;
> 
> +/* flavors of vec_splat_[us]{8,16,32}.  */
> +case ALTIVEC_BUILTIN_VSPLTISB:
> +case ALTIVEC_BUILTIN_VSPLTISH:
> +case ALTIVEC_BUILTIN_VSPLTISW:
> +  {
> +  arg0 = gimple_call_arg (stmt, 0);
> +  lhs = gimple_call_lhs (stmt);
> +  /* Only fold the vec_splat_*() if arg0 is constant.  */
> +  if (TREE_CODE (arg0) != INTEGER_CST)
> +return false;
> +  gimple_seq stmts = NULL;
> +  location_t loc = gimple_location (stmt);
> +  tree splat_value = gimple_convert (&stmts, loc,
> + TREE_TYPE (TREE_TYPE (lhs)), arg0);
> +  gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
> +  tree splat_tree = build_vector_from_val (TREE_TYPE (lhs), splat_value);
> +  g = gimple_build_assign (lhs, splat_tree);
> +  gimple_set_location (g, gimple_location (stmt));
> +  gsi_replace (gsi, g, true);
> +  return true;
> +  }
> +
> default:
>   if (TARGET_DEBUG_BUILTIN)
>   fprintf (stderr, "gimple builtin intrinsic not matched:%d %s %s\n",
>fn_code, fn_name1, fn_name2);
>   break;
> diff --git a/gcc/testsuite/gcc.target/powerpc/fold-vec-splat-misc-invalid.c 
> b/gcc/testsuite/gcc.target/powerpc/fold-vec-splat-misc-invalid.c
> new file mode 100644
> index 000..20f5b05
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/powerpc/fold-vec-splat-misc-invalid.c
> @@ -0,0 +1,33 @@
> +/* Verify that overloaded built-ins for vec_splat_s8 and vec_splat_s16
> +   generate errors as expected when we attempt to use invalid inputs.  */
> +
> +/* { dg-do compile } */
> +/* { dg-require-effective-target powerpc_vsx_ok } */
> +/* { dg-options "-mvsx -O2" } */
> +
> +#include 
> +
> +vector signed short
> +testss_1 (unsigned int ui)
> +{
> +  return vec_splat_s16 (ui);/* { dg-error "argument 1 must be a 5-bit signed 
> literal" } */
> +}
> +
> +vector unsigned short
> +testss_2 (signed int si)
> +{
> +  return vec_splat_u16 (si);/* { dg-error "argument 1 must be a 5-bit signed 
> literal" } */
> +}
> +
> +vector signed char
> +testsc_1 (unsigned int ui)
> +{
> +  return vec_splat_s8 (ui); /* { dg-error "argument 1 must be a 5-bit signed 
> literal" } */
> +}
> +
> +vector unsigned char
> +testsc_2 (signed int si)
> +{
> +  return vec_splat_u8 (si);/* { dg-error "argument 1 must be a 5-bit signed 
> literal" } */
> +}
> +
> 
> 



Re: [PATCH v7 4/4] Add gdb for or1k build

2017-12-08 Thread Stafford Horne
Hello,

This patch was sent in May, now everything is in place to commit the
OpenRISC gdb port upstream except for the OK from GCC on this patch.

Are there any concerns?  Who will commit this to GCC?

On Mon, May 29, 2017 at 11:48 PM, Stafford Horne  wrote:
> * ChangeLog:
>
> 2017-02-12  Stafford Horne  
>
> * configure.ac: Remove logic adding gdb to noconfigsdirs for or1k.
> * configure: Regenerate.
>
> Cc: gcc-patches@gcc.gnu.org
> ---
>  configure| 7 ---
>  configure.ac | 7 ---
>  2 files changed, 14 deletions(-)
>
> diff --git a/configure b/configure
> index be9dd89..0bf47fa 100755
> --- a/configure
> +++ b/configure
> @@ -3632,10 +3632,6 @@ case "${target}" in
>  ;;
>*-*-rtems*)
>  noconfigdirs="$noconfigdirs target-libgloss"
> -# this is not caught below because this stanza matches earlier
> -case $target in
> -  or1k*-*-*) noconfigdirs="$noconfigdirs gdb" ;;
> -esac
>  ;;
>  # The tpf target doesn't support gdb yet.
>*-*-tpf*)
> @@ -3841,9 +3837,6 @@ case "${target}" in
>nvptx*-*-*)
>  noconfigdirs="$noconfigdirs target-libssp target-libstdc++-v3 
> target-libobjc"
>  ;;
> -  or1k*-*-*)
> -noconfigdirs="$noconfigdirs gdb"
> -;;
>sh-*-*)
>  case "${target}" in
>sh*-*-elf)
> diff --git a/configure.ac b/configure.ac
> index 532c5c2..9d16792 100644
> --- a/configure.ac
> +++ b/configure.ac
> @@ -966,10 +966,6 @@ case "${target}" in
>  ;;
>*-*-rtems*)
>  noconfigdirs="$noconfigdirs target-libgloss"
> -# this is not caught below because this stanza matches earlier
> -case $target in
> -  or1k*-*-*) noconfigdirs="$noconfigdirs gdb" ;;
> -esac
>  ;;
>  # The tpf target doesn't support gdb yet.
>*-*-tpf*)
> @@ -1175,9 +1171,6 @@ case "${target}" in
>nvptx*-*-*)
>  noconfigdirs="$noconfigdirs target-libssp target-libstdc++-v3 
> target-libobjc"
>  ;;
> -  or1k*-*-*)
> -noconfigdirs="$noconfigdirs gdb"
> -;;
>sh-*-*)
>  case "${target}" in
>sh*-*-elf)
> --
> 2.9.4
>


Re: Avoid -Werror=format-overflow error in dbxout.c (dbxout_block) on Solaris/SPARC

2017-12-08 Thread David Malcolm
On Mon, 2017-12-04 at 18:05 -0700, Martin Sebor wrote:
> On 12/04/2017 01:58 PM, David Malcolm wrote:
> > On Mon, 2017-12-04 at 12:18 -0700, Martin Sebor wrote:
> > > On 12/04/2017 05:41 AM, Rainer Orth wrote:
> > > > Within test last week, 64-bit Solaris/SPARC bootstrap began to
> > > > fail:
> > > > 
> > > > /vol/gcc/src/hg/trunk/local/gcc/dbxout.c: In function 'bool
> > > > dbxout_block(tree, int, tree, int)':
> > > > /vol/gcc/src/hg/trunk/local/gcc/dbxout.c:3767:1: error: '%lu'
> > > > directive writing between 1 and 20 bytes into a region of size
> > > > 14
> > > > [-Werror=format-overflow=]
> > > >  dbxout_block (tree block, int depth, tree args, int
> > > > parent_blocknum)
> > > >  ^~~~
> > > > /vol/gcc/src/hg/trunk/local/gcc/dbxout.c:3767:1: note:
> > > > directive
> > > > argument in the range [0, 18446744073709551614]
> > > > In file included from ./tm.h:26,
> > > >  from
> > > > /vol/gcc/src/hg/trunk/local/gcc/target.h:52,
> > > >  from
> > > > /vol/gcc/src/hg/trunk/local/gcc/dbxout.c:72:
> > > > /vol/gcc/src/hg/trunk/local/gcc/config/sparc/sol2.h:353:11:
> > > > note:
> > > > 'std::sprintf' output between 8 and 27 bytes into a destination
> > > > of
> > > > size 20
> > > >sprintf ((LABEL), "*.L%s%lu", (PREFIX), (unsigned
> > > > long)(NUM))
> > > >^~~~
> > > > ~
> > > > /vol/gcc/src/hg/trunk/local/gcc/dbxout.c:3855:5: note: in
> > > > expansion
> > > > of macro 'ASM_GENERATE_INTERNAL_LABEL'
> > > >  ASM_GENERATE_INTERNAL_LABEL (buf, "LBB", parent_blocknum);
> > > >  ^~~
> > > > 
> > > > The line numbers are extremely confusing, to say the least,
> > > > though:
> > > > the
> > > > one in the error and the first note refer to the begin of the
> > > > function
> > > > definition and only the third note refers to the line of the
> > > > actual
> > > > error.
> > > 
> > > I agree it looks confusing.  It's the result of the interaction
> > > between macro tracking and inlining.
> > > 
> > > I think it's a general problem that affects many (though not all)
> > > warnings emitted out of the middle-end.  For instance, the
> > > example
> > > below results in similar output.  The output changes depending on
> > > whether or not _FORTIFY_SOURCE is defined.
> > > 
> > > #include 
> > > 
> > > #define FOO(d, s) strcpy (d, s)
> > > 
> > > void foo (char *d, const char *s) { FOO (d, s); }
> > > 
> > > void sink (char*);
> > > 
> > > void bar (void)
> > > {
> > >char a[3];
> > > 
> > >foo (a, "1234567");   // bug here
> > > 
> > >sink (a);
> > > }
> > > 
> > > Without _FORTIFY_SOURCE:
> > > 
> > > d.c: In function ‘void bar()’:
> > > d.c:3:26: warning: ‘void* __builtin_memcpy(void*, const void*,
> > > long
> > > unsigned int)’ writing 8 bytes into a region of size 3 overflows
> > > the
> > > destination [-Wstringop-overflow=]
> > >   #define FOO(d, s) strcpy (d, s)
> > > ~~~^~
> > > d.c:5:37: note: in expansion of macro ‘FOO’
> > >   void foo (char *d, const char *s) { FOO (d, s); }
> > >   ^~~
> > > 
> > > If bar() were a bigger function with multiple calls to foo() it
> > > could be tricky to tell which of them caused the warning.
> > > 
> > > With _FORTIFY_SOURCE:
> > > 
> > > In file included from /usr/include/string.h:635,
> > >   from d.c:1:
> > > In function ‘char* strcpy(char*, const char*)’,
> > >  inlined from ‘void bar()’ at d.c:5:37:
> > > /usr/include/bits/string3.h:110:33: warning: ‘void*
> > > __builtin___memcpy_chk(void*, const void*, long unsigned int,
> > > long
> > > unsigned int)’ writing 8 bytes into a region of size 3 overflows
> > > the
> > > destination [-Wstringop-overflow=]
> > > return __builtin___strcpy_chk (__dest, __src, __bos
> > > (__dest));
> > >~~~^~~
> > > 
> > > This suffers from the same problem as the first case: the line
> > > number of the offending call in bar() is missing.
> > > 
> > > In both cases the problem is compounded by the diagnostic, as
> > > a result of folding, referring to __builtin___memcpy_chk while
> > > the source code contains a call to strcpy.
> > > 
> > > I don't know nearly enough about the internals of the diagnostic
> > > subsystem to tell how difficult it might be to change the output
> > > to make it more readable.  David Malcolm is the expert on this
> > > area so he might have an idea what it would take.
> > 
> > [Did you mean to CC me on this, rather than dje?]
> 
> I meant you but I'm interested in all expert opinions/suggestions.
> 
> > I'm not as familiar as I could be on the existing inlining-tracking
> > implementation - so much so that, in ignorance, I wrote my own
> > version
> > of it earlier this year (doh!).
> > 
> > The warning implementation reads:
> > 
> > 3151warning_at (loc, opt,
> > 3152   

Re: Avoid -Werror=format-overflow error in dbxout.c (dbxout_block) on Solaris/SPARC

2017-12-08 Thread Martin Sebor

On 12/08/2017 02:34 PM, David Malcolm wrote:

On Mon, 2017-12-04 at 18:05 -0700, Martin Sebor wrote:

On 12/04/2017 01:58 PM, David Malcolm wrote:

On Mon, 2017-12-04 at 12:18 -0700, Martin Sebor wrote:

On 12/04/2017 05:41 AM, Rainer Orth wrote:

Within test last week, 64-bit Solaris/SPARC bootstrap began to
fail:

/vol/gcc/src/hg/trunk/local/gcc/dbxout.c: In function 'bool
dbxout_block(tree, int, tree, int)':
/vol/gcc/src/hg/trunk/local/gcc/dbxout.c:3767:1: error: '%lu'
directive writing between 1 and 20 bytes into a region of size
14
[-Werror=format-overflow=]
 dbxout_block (tree block, int depth, tree args, int
parent_blocknum)
 ^~~~
/vol/gcc/src/hg/trunk/local/gcc/dbxout.c:3767:1: note:
directive
argument in the range [0, 18446744073709551614]
In file included from ./tm.h:26,
 from
/vol/gcc/src/hg/trunk/local/gcc/target.h:52,
 from
/vol/gcc/src/hg/trunk/local/gcc/dbxout.c:72:
/vol/gcc/src/hg/trunk/local/gcc/config/sparc/sol2.h:353:11:
note:
'std::sprintf' output between 8 and 27 bytes into a destination
of
size 20
   sprintf ((LABEL), "*.L%s%lu", (PREFIX), (unsigned
long)(NUM))
   ^~~~
~
/vol/gcc/src/hg/trunk/local/gcc/dbxout.c:3855:5: note: in
expansion
of macro 'ASM_GENERATE_INTERNAL_LABEL'
 ASM_GENERATE_INTERNAL_LABEL (buf, "LBB", parent_blocknum);
 ^~~

The line numbers are extremely confusing, to say the least,
though:
the
one in the error and the first note refer to the begin of the
function
definition and only the third note refers to the line of the
actual
error.


I agree it looks confusing.  It's the result of the interaction
between macro tracking and inlining.

I think it's a general problem that affects many (though not all)
warnings emitted out of the middle-end.  For instance, the
example
below results in similar output.  The output changes depending on
whether or not _FORTIFY_SOURCE is defined.

#include 

#define FOO(d, s) strcpy (d, s)

void foo (char *d, const char *s) { FOO (d, s); }

void sink (char*);

void bar (void)
{
   char a[3];

   foo (a, "1234567");   // bug here

   sink (a);
}

Without _FORTIFY_SOURCE:

d.c: In function ‘void bar()’:
d.c:3:26: warning: ‘void* __builtin_memcpy(void*, const void*,
long
unsigned int)’ writing 8 bytes into a region of size 3 overflows
the
destination [-Wstringop-overflow=]
  #define FOO(d, s) strcpy (d, s)
~~~^~
d.c:5:37: note: in expansion of macro ‘FOO’
  void foo (char *d, const char *s) { FOO (d, s); }
  ^~~

If bar() were a bigger function with multiple calls to foo() it
could be tricky to tell which of them caused the warning.

With _FORTIFY_SOURCE:

In file included from /usr/include/string.h:635,
  from d.c:1:
In function ‘char* strcpy(char*, const char*)’,
 inlined from ‘void bar()’ at d.c:5:37:
/usr/include/bits/string3.h:110:33: warning: ‘void*
__builtin___memcpy_chk(void*, const void*, long unsigned int,
long
unsigned int)’ writing 8 bytes into a region of size 3 overflows
the
destination [-Wstringop-overflow=]
return __builtin___strcpy_chk (__dest, __src, __bos
(__dest));
   ~~~^~~

This suffers from the same problem as the first case: the line
number of the offending call in bar() is missing.

In both cases the problem is compounded by the diagnostic, as
a result of folding, referring to __builtin___memcpy_chk while
the source code contains a call to strcpy.

I don't know nearly enough about the internals of the diagnostic
subsystem to tell how difficult it might be to change the output
to make it more readable.  David Malcolm is the expert on this
area so he might have an idea what it would take.


[Did you mean to CC me on this, rather than dje?]


I meant you but I'm interested in all expert opinions/suggestions.


I'm not as familiar as I could be on the existing inlining-tracking
implementation - so much so that, in ignorance, I wrote my own
version
of it earlier this year (doh!).

The warning implementation reads:

3151warning_at (loc, opt,
3152(integer_onep (range[0])
3153 ? G_("%K%qD writing %E byte
into a region "
3154  "of size %E overflows the
destination")
3155 : G_("%K%qD writing %E bytes
into a region "
3156  "of size %E overflows the
destination")),
3157exp, get_callee_fndecl (exp),
range[0], objsize);


The inlining information is coming from that "%K" in the string,
which
leads to tree-pretty-print.c's percent_K_format being called for
"exp".
 This overwrites the "loc" provided for the warning_at with
EXPR_LOCATION of exp, and looks at TREE_BLOCK (exp), finding the
outermost containing block within its function, and writing it into
the
diagnostic_info's 

patch to fix PR83317

2017-12-08 Thread Vladimir Makarov

  The following patch fixes

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=83317

  The patch was successfully boostrapped and tested on x86-64.

  Committed as rev. 255517

Index: ChangeLog
===
--- ChangeLog	(revision 255516)
+++ ChangeLog	(working copy)
@@ -1,3 +1,8 @@
+2017-12-08  Vladimir Makarov  
+
+	PR rtl-optimization/83317
+	* lra-constraints.c (process_address_1): Add insn code check.
+
 2017-12-08  Michael Matz  
 
 	Fix PR tree-optimization/83323
Index: lra-constraints.c
===
--- lra-constraints.c	(revision 255470)
+++ lra-constraints.c	(working copy)
@@ -3222,7 +3222,8 @@ process_address_1 (int nop, bool check_o
   /* Do not attempt to decompose arbitrary addresses generated by combine
  for asm operands with loose constraints, e.g 'X'.  */
   else if (MEM_P (op)
-	   && !(get_constraint_type (cn) == CT_FIXED_FORM
+	   && !(INSN_CODE (curr_insn) < 0
+		&& get_constraint_type (cn) == CT_FIXED_FORM
 	&& constraint_satisfied_p (op, cn)))
 decompose_mem_address (&ad, op);
   else if (GET_CODE (op) == SUBREG
Index: testsuite/ChangeLog
===
--- testsuite/ChangeLog	(revision 255516)
+++ testsuite/ChangeLog	(working copy)
@@ -1,5 +1,10 @@
 2017-12-08  Jakub Jelinek  
 
+	PR rtl-optimization/83317
+	* gcc.target/i386/pr83317.c: New test.
+
+2017-12-08  Jakub Jelinek  
+
 	PR rtl-optimization/81595
 	* gcc.c-torture/compile/pr81595.c: New test.
 
Index: testsuite/gcc.target/i386/pr83317.c
===
--- testsuite/gcc.target/i386/pr83317.c	(nonexistent)
+++ testsuite/gcc.target/i386/pr83317.c	(working copy)
@@ -0,0 +1,21 @@
+/* PR rtl-optimization/83317 */
+/* { dg-do compile } */
+/* { dg-options "-O1" } */
+/* { dg-additional-options "-fPIC" { target fpic } } */
+/* { dg-additional-options "-msse2 -mfpmath=sse" { target ia32 } } */
+
+struct S { double a; };
+struct S c;
+int d, e;
+void *buf[64];
+extern int setjmp (void **);
+
+void
+foo ()
+{
+  setjmp (buf);
+  struct S g;
+  if (d)
+g.a = __builtin_copysign (e, d);
+  c = g;
+}


[PATCH] PR fortran/82934,83318 -- Enforce F2008:C631

2017-12-08 Thread Steve Kargl
The attached patch enforces F2008:C631, which of course is

/* F2008:C631 (R626) A type-param-value in a type-spec shall be an
   asterisk if and only if each allocate-object is a dummy argument
   for which the corresponding type parameter is assumed.  */

Regression tested on x86_64-*-freebsd.

2017-12-08  Steven G. Kargl  

PR fortran/82934
PR fortran/83318
* match.c (gfc_match_allocate): Enforce F2008:C631.

2017-12-08  Steven G. Kargl  

PR fortran/82934
PR fortran/83318
* gfortran.dg/allocate_assumed_charlen_2.f90: new test.

-- 
Steve
Index: gcc/fortran/match.c
===
--- gcc/fortran/match.c	(revision 255517)
+++ gcc/fortran/match.c	(working copy)
@@ -3960,9 +3960,9 @@ gfc_match_allocate (void)
   gfc_typespec ts;
   gfc_symbol *sym;
   match m;
-  locus old_locus, deferred_locus;
+  locus old_locus, deferred_locus, assumed_locus;
   bool saw_stat, saw_errmsg, saw_source, saw_mold, saw_deferred, b1, b2, b3;
-  bool saw_unlimited = false;
+  bool saw_unlimited = false, saw_assumed = false;
 
   head = tail = NULL;
   stat = errmsg = source = mold = tmp = NULL;
@@ -3993,6 +3993,9 @@ gfc_match_allocate (void)
 }
   else
 {
+  /* Needed for the F2008:C631 check below. */
+  assumed_locus = gfc_current_locus;
+
   if (gfc_match (" :: ") == MATCH_YES)
 	{
 	  if (!gfc_notify_std (GFC_STD_F2003, "typespec in ALLOCATE at %L",
@@ -4007,16 +4010,11 @@ gfc_match_allocate (void)
 	}
 
 	  if (ts.type == BT_CHARACTER)
-	ts.u.cl->length_from_typespec = true;
-
-	  /* TODO understand why this error does not appear but, instead,
-	 the derived type is caught as a variable in primary.c.  */
-	  if (gfc_spec_list_type (type_param_spec_list, NULL) != SPEC_EXPLICIT)
 	{
-	  gfc_error ("The type parameter spec list in the type-spec at "
-			 "%L cannot contain ASSUMED or DEFERRED parameters",
-			 &old_locus);
-	  goto cleanup;
+	  if (!ts.u.cl->length)
+		saw_assumed = true;
+	  else
+		ts.u.cl->length_from_typespec = true;
 	}
 	}
   else
@@ -4054,6 +4052,17 @@ gfc_match_allocate (void)
 
   if (impure)
 	gfc_unset_implicit_pure (NULL);
+
+  /* F2008:C631 (R626) A type-param-value in a type-spec shall be an
+	 asterisk if and only if each allocate-object is a dummy argument
+	 for which the corresponding type parameter is assumed.  */
+  if (saw_assumed
+	  && (tail->expr->ts.deferred || tail->expr->ts.u.cl->length))
+	{
+	  gfc_error ("Incompatible allocate-object at %C for CHARACTER "
+		 "type-spec at %L", &assumed_locus);
+	  goto cleanup;
+	}
 
   if (tail->expr->ts.deferred)
 	{
Index: gcc/testsuite/gfortran.dg/allocate_assumed_charlen_2.f90
===
--- gcc/testsuite/gfortran.dg/allocate_assumed_charlen_2.f90	(nonexistent)
+++ gcc/testsuite/gfortran.dg/allocate_assumed_charlen_2.f90	(working copy)
@@ -0,0 +1,20 @@
+! { dg-do compile }
+! PR fortran/82934
+! PR fortran/83318
+program a
+ character(len=42), allocatable :: f
+ character(len=22), allocatable :: ff
+ call alloc(f, ff)
+ if (len(f) .ne. 42) call abort
+ if (len(ff) .ne. 22) call abort
+contains
+ subroutine alloc( a, b )
+  character(len=*), allocatable  :: a
+  character(len=22), allocatable :: b
+  character(len=:), allocatable :: c
+  character, allocatable :: d
+  allocate(character(len=*)::a,b) ! { dg-error "Incompatible allocate-object" }
+  allocate(character(len=*)::c)   ! { dg-error "Incompatible allocate-object" }
+  allocate(character(len=*)::d)   ! { dg-error "Incompatible allocate-object" }
+ end subroutine
+end program a


[PATCH] Add .type and .size directives to riscv libgcc functions.

2017-12-08 Thread Jim Wilson
This rewrites the riscv libgcc function to use macros for function
definitions, to add the missing type and size directives, and to make
future changes easier.

This was tested with make check with rv32i and rv64i builds, with and
without -msave-restore, to force all of the modified functions to be used
in testing.  There were no regressions.  Committed.

libgcc/
* config/riscv/div.S: Use FUNC_* macros.
* config/riscv/muldi3.S, config/riscv/multi3.S: Likewise
* config/riscv/save-restore.S: Likewise.
* config/riscv/riscv-asm.h: New.
---
 libgcc/config/riscv/div.S  |  33 +++
 libgcc/config/riscv/muldi3.S   |   6 +-
 libgcc/config/riscv/multi3.S   |  13 ++-
 libgcc/config/riscv/riscv-asm.h|  35 +++
 libgcc/config/riscv/save-restore.S | 190 +
 5 files changed, 175 insertions(+), 102 deletions(-)
 create mode 100644 libgcc/config/riscv/riscv-asm.h

diff --git a/libgcc/config/riscv/div.S b/libgcc/config/riscv/div.S
index 63d542e846c..4366c5ce1dc 100644
--- a/libgcc/config/riscv/div.S
+++ b/libgcc/config/riscv/div.S
@@ -23,6 +23,8 @@ a copy of the GCC Runtime Library Exception along with this 
program;
 see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
 .  */
 
+#include "riscv-asm.h"
+
   .text
   .align 2
 
@@ -33,8 +35,7 @@ see the files COPYING3 and COPYING.RUNTIME respectively.  If 
not, see
 # define __divdi3 __divsi3
 # define __moddi3 __modsi3
 #else
-  .globl __udivsi3
-__udivsi3:
+FUNC_BEGIN (__udivsi3)
   /* Compute __udivdi3(a0 << 32, a1 << 32); cast result to uint32_t.  */
   slla0, a0, 32
   slla1, a1, 32
@@ -42,9 +43,9 @@ __udivsi3:
   jal__udivdi3
   sext.w a0, a0
   jr t0
+FUNC_END (__udivsi3)
 
-  .globl __umodsi3
-__umodsi3:
+FUNC_BEGIN (__umodsi3)
   /* Compute __udivdi3((uint32_t)a0, (uint32_t)a1); cast a1 to uint32_t.  */
   slla0, a0, 32
   slla1, a1, 32
@@ -54,25 +55,22 @@ __umodsi3:
   jal__udivdi3
   sext.w a0, a1
   jr t0
+FUNC_END (__umodsi3)
 
-  .globl __modsi3
-  __modsi3 = __moddi3
+FUNC_ALIAS (__modsi3, __moddi3)
 
-  .globl __divsi3
-__divsi3:
+FUNC_BEGIN( __divsi3)
   /* Check for special case of INT_MIN/-1. Otherwise, fall into __divdi3.  */
   lit0, -1
   beq   a1, t0, .L20
 #endif
 
-  .globl __divdi3
-__divdi3:
+FUNC_BEGIN (__divdi3)
   bltz  a0, .L10
   bltz  a1, .L11
   /* Since the quotient is positive, fall into __udivdi3.  */
 
-  .globl __udivdi3
-__udivdi3:
+FUNC_BEGIN (__udivdi3)
   mva2, a1
   mva1, a0
   lia0, -1
@@ -96,14 +94,15 @@ __udivdi3:
   bnez  a3, .L3
 .L5:
   ret
+FUNC_END (__udivdi3)
 
-  .globl __umoddi3
-__umoddi3:
+FUNC_BEGIN (__umoddi3)
   /* Call __udivdi3(a0, a1), then return the remainder, which is in a1.  */
   move  t0, ra
   jal   __udivdi3
   move  a0, a1
   jrt0
+FUNC_END (__umoddi3)
 
   /* Handle negative arguments to __divdi3.  */
 .L10:
@@ -118,9 +117,9 @@ __umoddi3:
   jal   __udivdi3
   neg   a0, a0
   jrt0
+FUNC_END (__divdi3)
 
-  .globl __moddi3
-__moddi3:
+FUNC_BEGIN (__moddi3)
   move   t0, ra
   bltz   a1, .L31
   bltz   a0, .L32
@@ -136,6 +135,7 @@ __moddi3:
   jal__udivdi3/* The dividend is hella negative.  */
   nega0, a1
   jr t0
+FUNC_END (__moddi3)
 
 #if __riscv_xlen == 64
   /* continuation of __divsi3 */
@@ -143,4 +143,5 @@ __moddi3:
   sll   t0, t0, 31
   bne   a0, t0, __divdi3
   ret
+FUNC_END (__divsi3)
 #endif
diff --git a/libgcc/config/riscv/muldi3.S b/libgcc/config/riscv/muldi3.S
index eb3d9b0df3d..7c07878eea9 100644
--- a/libgcc/config/riscv/muldi3.S
+++ b/libgcc/config/riscv/muldi3.S
@@ -23,6 +23,8 @@ a copy of the GCC Runtime Library Exception along with this 
program;
 see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
 .  */
 
+#include "riscv-asm.h"
+
   .text
   .align 2
 
@@ -31,8 +33,7 @@ see the files COPYING3 and COPYING.RUNTIME respectively.  If 
not, see
 # define __muldi3 __mulsi3
 #endif
 
-  .globl __muldi3
-__muldi3:
+FUNC_BEGIN (__muldi3)
   mv a2, a0
   li a0, 0
 .L1:
@@ -44,3 +45,4 @@ __muldi3:
   slli   a2, a2, 1
   bnez   a1, .L1
   ret
+FUNC_END (__muldi3)
diff --git a/libgcc/config/riscv/multi3.S b/libgcc/config/riscv/multi3.S
index 4d454e65013..a3b89c65206 100644
--- a/libgcc/config/riscv/multi3.S
+++ b/libgcc/config/riscv/multi3.S
@@ -23,6 +23,8 @@ a copy of the GCC Runtime Library Exception along with this 
program;
 see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
 .  */
 
+#include "riscv-asm.h"
+
   .text
   .align 2
 
@@ -31,8 +33,7 @@ see the files COPYING3 and COPYING.RUNTIME respectively.  If 
not, see
 # define __multi3 __muldi3
 #endif
 
-  .globl __multi3
-__multi3:
+FUNC_BEGIN (__multi3)
 
 #if __riscv_xlen == 32
 /* Our RV64 64-bit routines are equivalent to our RV32 32-bit routines.  */
@@ -79,3 +80,11 @@ __multi3:
   mv  a0, t2
   mv  a1, t4
   jr