[PATCH] Fix several AVX512 intrinsic mask arguments.

2018-07-05 Thread Grazvydas Ignotas
gcc/ChangeLog:

2018-07-05  Grazvydas Ignotas  

* config/i386/avx512bwintrin.h: (_mm512_mask_cmp_epi8_mask,
_mm512_mask_cmp_epu8_mask): Fix mask arguments.
---
 gcc/config/i386/avx512bwintrin.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/gcc/config/i386/avx512bwintrin.h b/gcc/config/i386/avx512bwintrin.h
index bd389fa..24ad5f1 100644
--- a/gcc/config/i386/avx512bwintrin.h
+++ b/gcc/config/i386/avx512bwintrin.h
@@ -3043,7 +3043,7 @@ _mm512_cmp_epi16_mask (__m512i __X, __m512i __Y, const 
int __P)
 
 extern __inline __mmask64
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm512_mask_cmp_epi8_mask (__mmask32 __U, __m512i __X, __m512i __Y,
+_mm512_mask_cmp_epi8_mask (__mmask64 __U, __m512i __X, __m512i __Y,
   const int __P)
 {
   return (__mmask64) __builtin_ia32_cmpb512_mask ((__v64qi) __X,
@@ -3081,7 +3081,7 @@ _mm512_cmp_epu16_mask (__m512i __X, __m512i __Y, const 
int __P)
 
 extern __inline __mmask64
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm512_mask_cmp_epu8_mask (__mmask32 __U, __m512i __X, __m512i __Y,
+_mm512_mask_cmp_epu8_mask (__mmask64 __U, __m512i __X, __m512i __Y,
   const int __P)
 {
   return (__mmask64) __builtin_ia32_ucmpb512_mask ((__v64qi) __X,
-- 
2.7.4



Re: [PATCH] Fix several AVX512 intrinsic mask arguments.

2018-07-05 Thread Grazvydas Ignotas
On Thu, Jul 5, 2018 at 9:28 PM, Jakub Jelinek  wrote:
> On Thu, Jul 05, 2018 at 08:30:27PM +0300, Grazvydas Ignotas wrote:
>> gcc/ChangeLog:
>>
>> 2018-07-05  Grazvydas Ignotas  
>>
>>   * config/i386/avx512bwintrin.h: (_mm512_mask_cmp_epi8_mask,
>>   _mm512_mask_cmp_epu8_mask): Fix mask arguments.
>
> LGTM, but
> 1) I think it would be nice to add a runtime testcase that fails (on avx512bw 
> hw)
>without this patch and succeeds with this patch (have some non-zero and
>zero bits in the high 32 bits of the mask and test that the result is
>correct

Looks like the existing tests can already do it if we correct an
apparent mistake (see attached patch).

> 2) there are other functions that have this bug, e.g.
>_mm_mask_cmp_epi8_mask, _mm256_mask_cmp_epi8_mask,
>_mm_mask_cmp_epu8_mask, _mm256_mask_cmp_epu8_mask in avx512vlbwintrin.h
>
> Let's grep for all suspicious parts:
> echo `sed -n '/^_mm.*__mmask/,/^}/p' config/i386/*.h | sed 's/^}/@@@/'` | sed 
> 's/@@@/}\n/g' | grep '__mmask8.*__mmask\(16\|32\|64\)'
>  _mm512_mask_bitshuffle_epi64_mask (__mmask8 __M, __m512i __A, __m512i __B) { 
> return (__mmask64) __builtin_ia32_vpshufbitqmb512_mask ((__v64qi) __A, 
> (__v64qi) __B, (__mmask64) __M); }
>  _mm_mask_cmp_epi8_mask (__mmask8 __U, __m128i __X, __m128i __Y, const int 
> __P) { return (__mmask16) __builtin_ia32_cmpb128_mask ((__v16qi) __X, 
> (__v16qi) __Y, __P, (__mmask16) __U); }
>  _mm_mask_cmp_epu8_mask (__mmask8 __U, __m128i __X, __m128i __Y, const int 
> __P) { return (__mmask16) __builtin_ia32_ucmpb128_mask ((__v16qi) __X, 
> (__v16qi) __Y, __P, (__mmask16) __U); }
> echo `sed -n '/^_mm.*__mmask/,/^}/p' config/i386/*.h | sed 's/^}/@@@/'` | sed 
> 's/@@@/}\n/g' | grep '__mmask16.*__mmask\(8\|32\|64\)'
>  _mm512_mask_xor_epi64 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) 
> { return (__m512i) __builtin_ia32_pxorq512_mask ((__v8di) __A, (__v8di) __B, 
> (__v8di) __W, (__mmask8) __U); }
>  _mm512_maskz_xor_epi64 (__mmask16 __U, __m512i __A, __m512i __B) { return 
> (__m512i) __builtin_ia32_pxorq512_mask ((__v8di) __A, (__v8di) __B, (__v8di) 
> _mm512_setzero_si512 (), (__mmask8) __U); }
>  _mm512_mask_cmpneq_epi64_mask (__mmask16 __M, __m512i __X, __m512i __Y) { 
> return (__mmask8) __builtin_ia32_cmpq512_mask ((__v8di) __X, (__v8di) __Y, 4, 
> (__mmask8) __M); }
>  _mm256_mask_cmp_epi8_mask (__mmask16 __U, __m256i __X, __m256i __Y, const 
> int __P) { return (__mmask32) __builtin_ia32_cmpb256_mask ((__v32qi) __X, 
> (__v32qi) __Y, __P, (__mmask32) __U); }
>  _mm256_mask_cmp_epu8_mask (__mmask16 __U, __m256i __X, __m256i __Y, const 
> int __P) { return (__mmask32) __builtin_ia32_ucmpb256_mask ((__v32qi) __X, 
> (__v32qi) __Y, __P, (__mmask32) __U); }
>  _mm_mask_add_ps (__m128 __W, __mmask16 __U, __m128 __A, __m128 __B) { return 
> (__m128) __builtin_ia32_addps128_mask ((__v4sf) __A, (__v4sf) __B, (__v4sf) 
> __W, (__mmask8) __U); }
>  _mm_maskz_add_ps (__mmask16 __U, __m128 __A, __m128 __B) { return (__m128) 
> __builtin_ia32_addps128_mask ((__v4sf) __A, (__v4sf) __B, (__v4sf) 
> _mm_setzero_ps (), (__mmask8) __U); }
>  _mm256_mask_add_ps (__m256 __W, __mmask16 __U, __m256 __A, __m256 __B) { 
> return (__m256) __builtin_ia32_addps256_mask ((__v8sf) __A, (__v8sf) __B, 
> (__v8sf) __W, (__mmask8) __U); }
>  _mm256_maskz_add_ps (__mmask16 __U, __m256 __A, __m256 __B) { return 
> (__m256) __builtin_ia32_addps256_mask ((__v8sf) __A, (__v8sf) __B, (__v8sf) 
> _mm256_setzero_ps (), (__mmask8) __U); }
>  _mm_mask_sub_ps (__m128 __W, __mmask16 __U, __m128 __A, __m128 __B) { return 
> (__m128) __builtin_ia32_subps128_mask ((__v4sf) __A, (__v4sf) __B, (__v4sf) 
> __W, (__mmask8) __U); }
>  _mm_maskz_sub_ps (__mmask16 __U, __m128 __A, __m128 __B) { return (__m128) 
> __builtin_ia32_subps128_mask ((__v4sf) __A, (__v4sf) __B, (__v4sf) 
> _mm_setzero_ps (), (__mmask8) __U); }
>  _mm256_mask_sub_ps (__m256 __W, __mmask16 __U, __m256 __A, __m256 __B) { 
> return (__m256) __builtin_ia32_subps256_mask ((__v8sf) __A, (__v8sf) __B, 
> (__v8sf) __W, (__mmask8) __U); }
>  _mm256_maskz_sub_ps (__mmask16 __U, __m256 __A, __m256 __B) { return 
> (__m256) __builtin_ia32_subps256_mask ((__v8sf) __A, (__v8sf) __B, (__v8sf) 
> _mm256_setzero_ps (), (__mmask8) __U); }
>  _mm256_maskz_cvtepi32_ps (__mmask16 __U, __m256i __A) { return (__m256) 
> __builtin_ia32_cvtdq2ps256_mask ((__v8si) __A, (__v8sf) _mm256_setzero_ps (), 
> (__mmask8) __U); }
>  _mm_maskz_cvtepi32_ps (__mmask16 __U, __m128i __A) { return (__m128) 
> __builtin_ia32_cvtdq2ps128_mask ((__v4si) __A, (__v4sf) _mm_setzero_ps (), 
> (__mmask8) __U); }
> echo `sed -n '/^_mm.*__mmask/,/^}/p' c

Re: [PATCH] Fix __mmask* types on many AVX512 intrinsics

2018-07-07 Thread Grazvydas Ignotas
On Sat, Jul 7, 2018 at 11:15 AM, Jakub Jelinek  wrote:
> Hi!
>
> On Fri, Jul 06, 2018 at 12:47:07PM +0200, Jakub Jelinek wrote:
>> On Thu, Jul 05, 2018 at 11:57:26PM +0300, Grazvydas Ignotas wrote:
>> > I think it would be more efficient if you took care of it. I won't
>> > have time for at least a few days anyway.
>
> Here is the complete patch, I found two further issues where
> the __mmask mismatch was in between the return type and what was used
> in the rest of the intrinsic, so not caught by my earlier greps.
>
> I've added (except for the avx512bitalg which seems to have no runtime
> test coverage whatsoever) tests that cover the real bugs and further
> fixed the avx512*-vpcmp{,u}b-2.c test because (rel) << i triggered UB
> if i could go up to 63.
>
> I don't have AVX512* hw, so I've just bootstrapped/regtested the patch
> normally on i686-linux and x86_64-linux AVX2 hw and tried the affected
> tests without the config/i386/ changes and with them under SDE.
> The patch should fix these FAILs:
>
> FAIL: gcc.target/i386/avx512bw-vpcmpb-2.c execution test
> FAIL: gcc.target/i386/avx512bw-vpcmpub-2.c execution test
> FAIL: gcc.target/i386/avx512f-vinsertf32x4-3.c execution test
> FAIL: gcc.target/i386/avx512f-vinserti32x4-3.c execution test
> FAIL: gcc.target/i386/avx512vl-vpcmpb-2.c execution test
> FAIL: gcc.target/i386/avx512vl-vpcmpgeb-2.c execution test
> FAIL: gcc.target/i386/avx512vl-vpcmpgeub-2.c execution test
> FAIL: gcc.target/i386/avx512vl-vpcmpgeuw-2.c execution test
> FAIL: gcc.target/i386/avx512vl-vpcmpgew-2.c execution test
> FAIL: gcc.target/i386/avx512vl-vpcmpleb-2.c execution test
> FAIL: gcc.target/i386/avx512vl-vpcmpleub-2.c execution test
> FAIL: gcc.target/i386/avx512vl-vpcmpleuw-2.c execution test
> FAIL: gcc.target/i386/avx512vl-vpcmplew-2.c execution test
> FAIL: gcc.target/i386/avx512vl-vpcmpltb-2.c execution test
> FAIL: gcc.target/i386/avx512vl-vpcmpltub-2.c execution test
> FAIL: gcc.target/i386/avx512vl-vpcmpltuw-2.c execution test
> FAIL: gcc.target/i386/avx512vl-vpcmpltw-2.c execution test
> FAIL: gcc.target/i386/avx512vl-vpcmpneqb-2.c execution test
> FAIL: gcc.target/i386/avx512vl-vpcmpnequb-2.c execution test
> FAIL: gcc.target/i386/avx512vl-vpcmpnequw-2.c execution test
> FAIL: gcc.target/i386/avx512vl-vpcmpneqw-2.c execution test
> FAIL: gcc.target/i386/avx512vl-vpcmpub-2.c execution test
>
> Ok for trunk?
>
> I guess we want to backport it soon, but would appreciate somebody testing
> it on real AVX512-{BW,VL} hw before doing the backports.

I've run the testsuite with this patch applied and all tests passed on
i7-7800X. There are avx512vl-vmovdqa64-1.c and avx512vl-vpermilpdi-1.c
failures, but those seem unrelated.

thanks,
Gražvydas


Re: [PATCH] Fix __mmask* types on many AVX512 intrinsics

2018-07-09 Thread Grazvydas Ignotas
On Mon, Jul 9, 2018 at 10:37 AM, Jakub Jelinek  wrote:
> On Sun, Jul 08, 2018 at 02:39:40AM +0300, Grazvydas Ignotas wrote:
>> > I guess we want to backport it soon, but would appreciate somebody testing
>> > it on real AVX512-{BW,VL} hw before doing the backports.
>>
>> I've run the testsuite with this patch applied and all tests passed on
>> i7-7800X.
>
> Thanks for the testing.
>
>> There are avx512vl-vmovdqa64-1.c and avx512vl-vpermilpdi-1.c
>> failures, but those seem unrelated.
>
> These are dg-do compile tests, and they PASS for me, even when doing
> make check-gcc RUNTESTFLAGS="--target_board=unix/-march=skylake-avx512 
> i386.exp='avx512vl-vmovdqa64-1.c avx512vl-vpermilpdi-1.c'"
> So, how exactly you've configured your gcc, what kind of options are
> passed to the test and how they FAIL?

I should've mentioned I've tested this patch on top of 8.1 release
tarball and used crosstool-NG to build the toolchain with it's "GCC
test suite" option enabled. It looks like crosstool is applying some
patches, so the results might not be valid. Here is the log (seems to
contain the configuration info), where I just grepped for FAIL and the
new test names to see if they were actually run:

http://notaz.gp2x.de/misc/unsorted/gcc.log.xz

Gražvydas


[PATCH][4.9/5 Backport] PR rtl-optimization/67037 Use copy_rtx when necessary

2015-10-28 Thread Grazvydas Ignotas
Hi,

This is the 4.9 and GCC 5 backport of patch from PR67037 that's already in 
trunk.
I've build it on 4.9 and confirmed that it works.

Grazvydas

Backport from mainline
2015-09-30  Bernd Edlinger  

PR rtl-optimization/67037
* lra-constraints.c (process_addr_reg): Use copy_rtx when necessary.

testsuite:
2015-09-30  Bernd Edlinger  

PR rtl-optimization/67037
* gcc.c-torture/execute/pr67037.c: New test.
---
 gcc/lra-constraints.c |  2 +-
 gcc/testsuite/gcc.c-torture/execute/pr67037.c | 49 +++
 2 files changed, 50 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.c-torture/execute/pr67037.c

diff --git a/gcc/lra-constraints.c b/gcc/lra-constraints.c
index ae8f3cd..919b127 100644
--- a/gcc/lra-constraints.c
+++ b/gcc/lra-constraints.c
@@ -1203,7 +1203,7 @@ process_addr_reg (rtx *loc, rtx *before, rtx *after, enum 
reg_class cl)
   if (after != NULL)
 {
   start_sequence ();
-  lra_emit_move (reg, new_reg);
+  lra_emit_move (before_p ? copy_rtx (reg) : reg, new_reg);
   emit_insn (*after);
   *after = get_insns ();
   end_sequence ();
diff --git a/gcc/testsuite/gcc.c-torture/execute/pr67037.c 
b/gcc/testsuite/gcc.c-torture/execute/pr67037.c
new file mode 100644
index 000..3119d32
--- /dev/null
+++ b/gcc/testsuite/gcc.c-torture/execute/pr67037.c
@@ -0,0 +1,49 @@
+long (*extfunc)();
+
+static inline void lstrcpynW( short *d, const short *s, int n )
+{
+unsigned int count = n;
+
+while ((count > 1) && *s)
+{
+count--;
+*d++ = *s++;
+}
+if (count) *d = 0;
+}
+
+int __attribute__((noinline,noclone))
+badfunc(int u0, int u1, int u2, int u3,
+  short *fsname, unsigned int fsname_len)
+{
+static const short ntfsW[] = {'N','T','F','S',0};
+char superblock[2048+3300];
+int ret = 0;
+short *p;
+
+if (extfunc())
+return 0;
+p = (void *)extfunc();
+if (p != 0)
+goto done;
+
+extfunc(superblock);
+
+lstrcpynW(fsname, ntfsW, fsname_len);
+
+ret = 1;
+done:
+return ret;
+}
+
+static long f()
+{
+return 0;
+}
+
+int main()
+{
+short buf[6];
+extfunc = f;
+return !badfunc(0, 0, 0, 0, buf, 6);
+}
-- 
1.9.1



Re: [PATCH][4.9/5 Backport] PR rtl-optimization/67037 Use copy_rtx when necessary

2015-11-04 Thread Grazvydas Ignotas
Can anyone commit this, please?

On Thu, Oct 29, 2015 at 12:48 AM, Grazvydas Ignotas  wrote:
> Hi,
>
> This is the 4.9 and GCC 5 backport of patch from PR67037 that's already in 
> trunk.
> I've build it on 4.9 and confirmed that it works.
>
> Grazvydas
>
> Backport from mainline
> 2015-09-30  Bernd Edlinger  
>
> PR rtl-optimization/67037
> * lra-constraints.c (process_addr_reg): Use copy_rtx when necessary.
>
> testsuite:
> 2015-09-30  Bernd Edlinger  
>
> PR rtl-optimization/67037
> * gcc.c-torture/execute/pr67037.c: New test.
> ---
>  gcc/lra-constraints.c |  2 +-
>  gcc/testsuite/gcc.c-torture/execute/pr67037.c | 49 
> +++
>  2 files changed, 50 insertions(+), 1 deletion(-)
>  create mode 100644 gcc/testsuite/gcc.c-torture/execute/pr67037.c
>
> diff --git a/gcc/lra-constraints.c b/gcc/lra-constraints.c
> index ae8f3cd..919b127 100644
> --- a/gcc/lra-constraints.c
> +++ b/gcc/lra-constraints.c
> @@ -1203,7 +1203,7 @@ process_addr_reg (rtx *loc, rtx *before, rtx *after, 
> enum reg_class cl)
>if (after != NULL)
>  {
>start_sequence ();
> -  lra_emit_move (reg, new_reg);
> +  lra_emit_move (before_p ? copy_rtx (reg) : reg, new_reg);
>emit_insn (*after);
>*after = get_insns ();
>end_sequence ();
> diff --git a/gcc/testsuite/gcc.c-torture/execute/pr67037.c 
> b/gcc/testsuite/gcc.c-torture/execute/pr67037.c
> new file mode 100644
> index 000..3119d32
> --- /dev/null
> +++ b/gcc/testsuite/gcc.c-torture/execute/pr67037.c
> @@ -0,0 +1,49 @@
> +long (*extfunc)();
> +
> +static inline void lstrcpynW( short *d, const short *s, int n )
> +{
> +unsigned int count = n;
> +
> +while ((count > 1) && *s)
> +{
> +count--;
> +*d++ = *s++;
> +}
> +if (count) *d = 0;
> +}
> +
> +int __attribute__((noinline,noclone))
> +badfunc(int u0, int u1, int u2, int u3,
> +  short *fsname, unsigned int fsname_len)
> +{
> +static const short ntfsW[] = {'N','T','F','S',0};
> +char superblock[2048+3300];
> +int ret = 0;
> +short *p;
> +
> +if (extfunc())
> +return 0;
> +p = (void *)extfunc();
> +if (p != 0)
> +goto done;
> +
> +extfunc(superblock);
> +
> +lstrcpynW(fsname, ntfsW, fsname_len);
> +
> +ret = 1;
> +done:
> +return ret;
> +}
> +
> +static long f()
> +{
> +return 0;
> +}
> +
> +int main()
> +{
> +short buf[6];
> +extfunc = f;
> +return !badfunc(0, 0, 0, 0, buf, 6);
> +}
> --
> 1.9.1
>