Hi! This patch deals just with correctness of vector shifts by a scalar non-immediate count. The manuals say the shift count is bits [63:0] of the corresponding source operand (XMM reg or memory in some cases), and if the count is bigger than the number of bits - 1 in the vector element, it is treated as a shift count equal to the number of bits. We were modelling it as an SImode shift count though, so the upper 32 bits may be random in some cases, which causes wrong code. Fixed by using DImode, which matches what the insns do.
Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk? Any thoughts on what to do to generate reasonable code when the shift count comes from memory (e.g. as int variable) or is in the low bits of some XMM register? First of all, perhaps we could have some combiner (or peephole) pattern that would transform sign-extend from e.g. SI to DI on the shift count into zero-extend if there are no other uses of the extension result - if the shift count is negative in SImode (or even QImode), then it is already a large number and the upper 32 bits or more don't really change anything on that. Then perhaps we could emit pmovzxdq for SSE4.1+ instead of going through GPRs and back, or for SSE2 pxor on a scratch reg and punpck* to get it zero extended. Not sure if we want to add =v / vm alternative to zero_extendsidi2*, it already has some x but with ?s that prevent the RA from using it. So thoughts on that? 2017-04-03 Jakub Jelinek <ja...@redhat.com> PR target/80286 * config/i386/i386.c (ix86_expand_args_builtin): If op has scalar int mode, convert_modes it to mode as unsigned, otherwise use lowpart_subreg to mode rather than SImode. * config/i386/sse.md (<mask_codefor>ashr<mode>3<mask_name>, ashr<mode>3, ashr<mode>3<mask_name>, <shift_insn><mode>3<mask_name>): Use DImode instead of SImode for the shift count operand. * config/i386/mmx.md (mmx_ashr<mode>3, mmx_<shift_insn><mode>3): Likewise. testsuite/ * gcc.target/i386/avx-pr80286.c: New test. * gcc.dg/pr80286.c: New test. --- gcc/config/i386/i386.c.jj 2017-04-03 10:40:22.000000000 +0200 +++ gcc/config/i386/i386.c 2017-04-03 18:31:39.482367634 +0200 @@ -35582,10 +35582,17 @@ ix86_expand_args_builtin (const struct b { /* SIMD shift insns take either an 8-bit immediate or register as count. But builtin functions take int as - count. If count doesn't match, we put it in register. + count. If count doesn't match, we put it in register. 
+ The instructions are using 64-bit count, if op is just + 32-bit, zero-extend it, as negative shift counts + are undefined behavior and zero-extension is more + efficient. */ if (!match) { - op = lowpart_subreg (SImode, op, GET_MODE (op)); + if (SCALAR_INT_MODE_P (GET_MODE (op))) + op = convert_modes (mode, GET_MODE (op), op, 1); + else + op = lowpart_subreg (mode, op, GET_MODE (op)); if (!insn_p->operand[i + 1].predicate (op, mode)) op = copy_to_reg (op); } --- gcc/config/i386/sse.md.jj 2017-04-03 13:43:50.179572564 +0200 +++ gcc/config/i386/sse.md 2017-04-03 18:01:19.713852914 +0200 @@ -10620,7 +10620,7 @@ (define_insn "<mask_codefor>ashr<mode>3< [(set (match_operand:VI24_AVX512BW_1 0 "register_operand" "=v,v") (ashiftrt:VI24_AVX512BW_1 (match_operand:VI24_AVX512BW_1 1 "nonimmediate_operand" "v,vm") - (match_operand:SI 2 "nonmemory_operand" "v,N")))] + (match_operand:DI 2 "nonmemory_operand" "v,N")))] "TARGET_AVX512VL" "vpsra<ssemodesuffix>\t{%2, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %2}" [(set_attr "type" "sseishft") @@ -10634,7 +10634,7 @@ (define_insn "ashr<mode>3" [(set (match_operand:VI24_AVX2 0 "register_operand" "=x,x") (ashiftrt:VI24_AVX2 (match_operand:VI24_AVX2 1 "register_operand" "0,x") - (match_operand:SI 2 "nonmemory_operand" "xN,xN")))] + (match_operand:DI 2 "nonmemory_operand" "xN,xN")))] "TARGET_SSE2" "@ psra<ssemodesuffix>\t{%2, %0|%0, %2} @@ -10667,7 +10667,7 @@ (define_insn "ashr<mode>3<mask_name>" [(set (match_operand:VI248_AVX512BW_AVX512VL 0 "register_operand" "=v,v") (ashiftrt:VI248_AVX512BW_AVX512VL (match_operand:VI248_AVX512BW_AVX512VL 1 "nonimmediate_operand" "v,vm") - (match_operand:SI 2 "nonmemory_operand" "v,N")))] + (match_operand:DI 2 "nonmemory_operand" "v,N")))] "TARGET_AVX512F" "vpsra<ssemodesuffix>\t{%2, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %2}" [(set_attr "type" "sseishft") @@ -10681,7 +10681,7 @@ (define_insn "<shift_insn><mode>3<mask_n [(set (match_operand:VI2_AVX2_AVX512BW 0 "register_operand" "=x,v") 
(any_lshift:VI2_AVX2_AVX512BW (match_operand:VI2_AVX2_AVX512BW 1 "register_operand" "0,v") - (match_operand:SI 2 "nonmemory_operand" "xN,vN")))] + (match_operand:DI 2 "nonmemory_operand" "xN,vN")))] "TARGET_SSE2 && <mask_mode512bit_condition> && <mask_avx512bw_condition>" "@ p<vshift><ssemodesuffix>\t{%2, %0|%0, %2} @@ -10700,7 +10700,7 @@ (define_insn "<shift_insn><mode>3<mask_n [(set (match_operand:VI48_AVX2 0 "register_operand" "=x,x,v") (any_lshift:VI48_AVX2 (match_operand:VI48_AVX2 1 "register_operand" "0,x,v") - (match_operand:SI 2 "nonmemory_operand" "xN,xN,vN")))] + (match_operand:DI 2 "nonmemory_operand" "xN,xN,vN")))] "TARGET_SSE2 && <mask_mode512bit_condition>" "@ p<vshift><ssemodesuffix>\t{%2, %0|%0, %2} @@ -10720,7 +10720,7 @@ (define_insn "<shift_insn><mode>3<mask_n [(set (match_operand:VI48_512 0 "register_operand" "=v,v") (any_lshift:VI48_512 (match_operand:VI48_512 1 "nonimmediate_operand" "v,m") - (match_operand:SI 2 "nonmemory_operand" "vN,N")))] + (match_operand:DI 2 "nonmemory_operand" "vN,N")))] "TARGET_AVX512F && <mask_mode512bit_condition>" "vp<vshift><ssemodesuffix>\t{%2, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %2}" [(set_attr "isa" "avx512f") --- gcc/config/i386/mmx.md.jj 2017-04-03 13:43:50.119573339 +0200 +++ gcc/config/i386/mmx.md 2017-04-03 18:01:19.708852979 +0200 @@ -930,7 +930,7 @@ (define_insn "mmx_ashr<mode>3" [(set (match_operand:MMXMODE24 0 "register_operand" "=y") (ashiftrt:MMXMODE24 (match_operand:MMXMODE24 1 "register_operand" "0") - (match_operand:SI 2 "nonmemory_operand" "yN")))] + (match_operand:DI 2 "nonmemory_operand" "yN")))] "TARGET_MMX" "psra<mmxvecsize>\t{%2, %0|%0, %2}" [(set_attr "type" "mmxshft") @@ -944,7 +944,7 @@ (define_insn "mmx_<shift_insn><mode>3" [(set (match_operand:MMXMODE248 0 "register_operand" "=y") (any_lshift:MMXMODE248 (match_operand:MMXMODE248 1 "register_operand" "0") - (match_operand:SI 2 "nonmemory_operand" "yN")))] + (match_operand:DI 2 "nonmemory_operand" "yN")))] "TARGET_MMX" 
"p<vshift><mmxvecsize>\t{%2, %0|%0, %2}" [(set_attr "type" "mmxshft") --- gcc/testsuite/gcc.target/i386/avx-pr80286.c.jj 2017-04-03 18:44:07.552698281 +0200 +++ gcc/testsuite/gcc.target/i386/avx-pr80286.c 2017-04-03 18:43:51.000000000 +0200 @@ -0,0 +1,26 @@ +/* PR target/80286 */ +/* { dg-do run { target avx } } */ +/* { dg-options "-O2 -mavx" } */ + +#include "avx-check.h" +#include <immintrin.h> + +__m256i m; + +__attribute__((noinline, noclone)) __m128i +foo (__m128i x) +{ + int s = _mm_cvtsi128_si32 (_mm256_castsi256_si128 (m)); + return _mm_srli_epi16 (x, s); +} + +static void +avx_test (void) +{ + __m128i a = (__m128i) (__v8hi) { 1 << 7, 2 << 8, 3 << 9, 4 << 10, 5 << 11, 6 << 12, 7 << 13, 8 << 12 }; + m = (__m256i) (__v8si) { 7, 8, 9, 10, 11, 12, 13, 14 }; + __m128i c = foo (a); + __m128i b = (__m128i) (__v8hi) { 1, 2 << 1, 3 << 2, 4 << 3, 5 << 4, 6 << 5, 7 << 6, 8 << 5 }; + if (__builtin_memcmp (&c, &b, sizeof (__m128i))) + __builtin_abort (); +} --- gcc/testsuite/gcc.dg/pr80286.c.jj 2017-04-03 18:45:27.574663948 +0200 +++ gcc/testsuite/gcc.dg/pr80286.c 2017-04-03 18:45:18.386782707 +0200 @@ -0,0 +1,23 @@ +/* PR target/80286 */ +/* { dg-do run } */ +/* { dg-options "-O2 -Wno-psabi" } */ + +typedef int V __attribute__((vector_size (4 * sizeof (int)))); + +__attribute__((noinline, noclone)) V +foo (V x, V y) +{ + return x << y[0]; +} + +int +main () +{ + V x = { 1, 2, 3, 4 }; + V y = { 5, 6, 7, 8 }; + V z = foo (x, y); + V e = { 1 << 5, 2 << 5, 3 << 5, 4 << 5 }; + if (__builtin_memcmp (&z, &e, sizeof (V))) + __builtin_abort (); + return 0; +} Jakub