Hi! The code we emit on the following testcases is really terrible, with both -mavx512f -mno-avx512bw as well as -mavx512bw, rather than doing e.g. vpinsrw $0, %edi, %xmm0, %xmm1 vinserti32x4 $0, %xmm1, %zmm0, %zmm0 when trying to insert into low 128-bits or vextracti32x4 $1, %zmm0, %xmm1 vpinsrw $3, %edi, %xmm1, %xmm1 vinserti32x4 $1, %xmm1, %zmm0, %zmm0 when trying to insert into other 128-bits, we emit: pushq %rbp vmovq %xmm0, %rax movzwl %di, %edi xorw %ax, %ax movq %rsp, %rbp orq %rdi, %rax andq $-64, %rsp vmovdqa64 %zmm0, -64(%rsp) movq %rax, -64(%rsp) vmovdqa64 -64(%rsp), %zmm0 leave and furthermore there is some RA bug it triggers, so we miscompile it. All this is because while at least for AVX512BW we have ix86_expand_vector_set implemented for V64QImode and V32QImode, we actually don't have a pattern for it. Fixed by adding those modes to V, which is used only by this vec_set pattern and some xop pattern, which is misusing it anyway (it really can't handle 512-bit vectors, so could result in assembly failures etc. with -mxop -mavx512f). Furthermore, for AVX512F we can use the above extraction/insertion/insertion sequence, similarly to what we do for 256-bit insertions, just we have halves there rather than quarters. The splitters are added to match what vec_set_lo_* define_insn_and_split do, if we are trying to extract low 128-bit lane of a 512-bit vector, we really don't need vextracti32x4 instruction, we can use simple move or even nothing at all (if source and destination are the same register, or post RA register renaming can arrange that).
The RA bug still should be fixed, but at least this patch makes it latent and improves a lot the code. Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk? 2018-03-30 Jakub Jelinek <ja...@redhat.com> PR middle-end/85090 * config/i386/sse.md (V): Add V64QI and V32HI for TARGET_AVX512F. (V_128_256): New mode iterator. (*avx512dq_vextract<shuffletype>64x2_1 splitter): New define_split. (*avx512f_vextract<shuffletype>32x4_1 splitter): Likewise. (xop_pcmov_<mode><avxsizesuffix>): Use V_128_256 mode iterator instead of V. * config/i386/i386.c (ix86_expand_vector_set): Improve V32HImode and V64QImode expansion for !TARGET_AVX512BW && TARGET_AVX512F. * gcc.target/i386/avx512f-pr85090-1.c: New test. * gcc.target/i386/avx512f-pr85090-2.c: New test. * gcc.target/i386/avx512f-pr85090-3.c: New test. * gcc.target/i386/avx512bw-pr85090-2.c: New test. * gcc.target/i386/avx512bw-pr85090-3.c: New test. --- gcc/config/i386/sse.md.jj 2018-03-29 12:38:32.088512220 +0200 +++ gcc/config/i386/sse.md 2018-03-30 14:56:19.949549653 +0200 @@ -229,8 +229,8 @@ (define_mode_iterator VI1_AVX512VL ;; All vector modes (define_mode_iterator V - [(V32QI "TARGET_AVX") V16QI - (V16HI "TARGET_AVX") V8HI + [(V64QI "TARGET_AVX512F") (V32QI "TARGET_AVX") V16QI + (V32HI "TARGET_AVX512F") (V16HI "TARGET_AVX") V8HI (V16SI "TARGET_AVX512F") (V8SI "TARGET_AVX") V4SI (V8DI "TARGET_AVX512F") (V4DI "TARGET_AVX") V2DI (V16SF "TARGET_AVX512F") (V8SF "TARGET_AVX") V4SF @@ -244,6 +244,10 @@ (define_mode_iterator V_128 (define_mode_iterator V_256 [V32QI V16HI V8SI V4DI V8SF V4DF]) +;; All 128bit and 256bit vector modes +(define_mode_iterator V_128_256 + [V32QI V16QI V16HI V8HI V8SI V4SI V4DI V2DI V8SF V4SF V4DF V2DF]) + ;; All 512bit vector modes (define_mode_iterator V_512 [V64QI V32HI V16SI V8DI V16SF V8DF]) @@ -7351,6 +7355,15 @@ (define_insn "<mask_codefor>avx512dq_vex (set_attr "prefix" "evex") (set_attr "mode" "<sseinsnmode>")]) +(define_split + [(set (match_operand:<ssequartermode> 0 "nonimmediate_operand") + (vec_select:<ssequartermode> + (match_operand:V8FI 1 "register_operand") + (parallel [(const_int 0) (const_int 1)])))] + "TARGET_AVX512DQ && reload_completed" + [(set (match_dup 0) (match_dup 1))] + "operands[1] = gen_lowpart (<ssequartermode>mode, operands[1]);") + (define_insn "<mask_codefor>avx512f_vextract<shuffletype>32x4_1<mask_name>" [(set (match_operand:<ssequartermode> 0 "<store_mask_predicate>" "=<store_mask_constraint>") (vec_select:<ssequartermode> @@ -7374,6 +7387,16 @@ (define_insn "<mask_codefor>avx512f_vext (set_attr "prefix" "evex") (set_attr "mode" "<sseinsnmode>")]) +(define_split + [(set (match_operand:<ssequartermode> 0 "nonimmediate_operand") + (vec_select:<ssequartermode> + (match_operand:V16FI 1 "register_operand") + (parallel [(const_int 0) (const_int 1) + (const_int 2) (const_int 3)])))] + "TARGET_AVX512F && reload_completed" + [(set (match_dup 0) (match_dup 1))] + "operands[1] = gen_lowpart (<ssequartermode>mode, operands[1]);") + (define_mode_attr extract_type_2 [(V16SF "avx512dq") (V16SI "avx512dq") (V8DF "avx512f") (V8DI "avx512f")]) @@ -16478,11 +16501,11 @@ (define_insn "xop_p<madcs>wd" ;; XOP parallel XMM conditional moves (define_insn "xop_pcmov_<mode><avxsizesuffix>" - [(set (match_operand:V 0 "register_operand" "=x,x") - (if_then_else:V - (match_operand:V 3 "nonimmediate_operand" "x,m") - (match_operand:V 1 "register_operand" "x,x") - (match_operand:V 2 "nonimmediate_operand" "xm,x")))] + [(set (match_operand:V_128_256 0 "register_operand" "=x,x") + (if_then_else:V_128_256 + (match_operand:V_128_256 3 "nonimmediate_operand" "x,m") + (match_operand:V_128_256 1 "register_operand" "x,x") + (match_operand:V_128_256 2 "nonimmediate_operand" "xm,x")))] "TARGET_XOP" "vpcmov\t{%3, %2, %1, %0|%0, %1, %2, %3}" [(set_attr "type" "sse4arg")]) --- gcc/config/i386/i386.c.jj 2018-03-28 21:14:29.533744671 +0200 +++ gcc/config/i386/i386.c 2018-03-29 21:16:06.154318265 +0200 @@ -44085,21 +44085,69 @@ half: break; case E_V32HImode: - if (TARGET_AVX512F && TARGET_AVX512BW) + if (TARGET_AVX512BW) { mmode = SImode; gen_blendm = gen_avx512bw_blendmv32hi; } + else if (TARGET_AVX512F) + { + half_mode = E_V8HImode; + n = 8; + goto quarter; + } break; case E_V64QImode: - if (TARGET_AVX512F && TARGET_AVX512BW) + if (TARGET_AVX512BW) { mmode = DImode; gen_blendm = gen_avx512bw_blendmv64qi; } + else if (TARGET_AVX512F) + { + half_mode = E_V16QImode; + n = 16; + goto quarter; + } break; +quarter: + /* Compute offset. */ + i = elt / n; + elt %= n; + + gcc_assert (i <= 3); + + { + /* Extract the quarter. */ + tmp = gen_reg_rtx (V4SImode); + rtx tmp2 = gen_lowpart (V16SImode, target); + rtx mask = gen_reg_rtx (QImode); + + emit_move_insn (mask, constm1_rtx); + emit_insn (gen_avx512f_vextracti32x4_mask (tmp, tmp2, GEN_INT (i), + tmp, mask)); + + tmp2 = gen_reg_rtx (half_mode); + emit_move_insn (tmp2, gen_lowpart (half_mode, tmp)); + tmp = tmp2; + + /* Put val in tmp at elt. */ + ix86_expand_vector_set (false, tmp, val, elt); + + /* Put it back. */ + tmp2 = gen_reg_rtx (V16SImode); + rtx tmp3 = gen_lowpart (V16SImode, target); + mask = gen_reg_rtx (HImode); + emit_move_insn (mask, constm1_rtx); + tmp = gen_lowpart (V4SImode, tmp); + emit_insn (gen_avx512f_vinserti32x4_mask (tmp2, tmp3, tmp, GEN_INT (i), + tmp3, mask)); + emit_move_insn (target, gen_lowpart (mode, tmp2)); + } + return; + default: break; } --- gcc/testsuite/gcc.target/i386/avx512f-pr85090-1.c.jj 2018-03-30 15:04:47.964730406 +0200 +++ gcc/testsuite/gcc.target/i386/avx512f-pr85090-1.c 2018-03-30 15:04:26.722722711 +0200 @@ -0,0 +1,35 @@ +/* PR middle-end/85090 */ +/* { dg-do run { target int128 } } */ +/* { dg-require-effective-target avx512f } */ +/* { dg-options "-O2 -fno-tree-dominator-opts -mavx512f -fira-algorithm=priority" } */ + +#include "avx512f-check.h" + +typedef unsigned short U __attribute__ ((vector_size (64))); +typedef unsigned int V __attribute__ ((vector_size (64))); +typedef unsigned __int128 W __attribute__ ((vector_size (64))); + +V h; +W d, e, g; +U f; + +static __attribute__((noipa)) U +foo (U i) +{ + f >>= ((U)d > f) & 1; + i[0] <<= 1; + e = (7 & -d) << (7 & -(g & 7)); + return i; +} + +void +avx512f_test (void) +{ + U x; + for (unsigned i = 0; i < 32; i++) + x[i] = i; + x = foo (x); + for (unsigned i = 0; i < 32; i++) + if (x[i] != i) + abort (); +} --- gcc/testsuite/gcc.target/i386/avx512f-pr85090-2.c.jj 2018-03-30 16:18:01.744300954 +0200 +++ gcc/testsuite/gcc.target/i386/avx512f-pr85090-2.c 2018-03-30 16:17:26.674285394 +0200 @@ -0,0 +1,37 @@ +/* PR middle-end/85090 */ +/* { dg-do compile } */ +/* { dg-options "-O2 -mavx512f -mno-avx512bw -mtune=intel -masm=att" } */ + +typedef short V __attribute__((vector_size (64))); + +V +f1 (V x, int y) +{ + x[0] = y; + return x; +} + +V +f2 (V x, int y) +{ + x[7] = y; + return x; +} + +V +f3 (V x, int y) +{ + x[11] = y; + return x; +} + +V +f4 (V x, int y) +{ + x[29] = y; + return x; +} + +/* { dg-final { scan-assembler-times "vpinsrw\t" 4 } } */ +/* { dg-final { scan-assembler-times "vextracti32x4\t" 2 } } */ +/* { dg-final { scan-assembler-times "vinserti32x4\t" 4 } } */ --- gcc/testsuite/gcc.target/i386/avx512f-pr85090-3.c.jj 2018-03-30 16:21:01.358380633 +0200 +++ gcc/testsuite/gcc.target/i386/avx512f-pr85090-3.c 2018-03-30 16:21:27.633392302 +0200 @@ -0,0 +1,37 @@ +/* PR middle-end/85090 */ +/* { dg-do compile } */ +/* { dg-options "-O2 -mavx512f -mno-avx512bw -mtune=intel -masm=att" } */ + +typedef signed char V __attribute__((vector_size (64))); + +V +f1 (V x, int y) +{ + x[0] = y; + return x; +} + +V +f2 (V x, int y) +{ + x[15] = y; + return x; +} + +V +f3 (V x, int y) +{ + x[22] = y; + return x; +} + +V +f4 (V x, int y) +{ + x[59] = y; + return x; +} + +/* { dg-final { scan-assembler-times "vpinsrb\t" 4 } } */ +/* { dg-final { scan-assembler-times "vextracti32x4\t" 2 } } */ +/* { dg-final { scan-assembler-times "vinserti32x4\t" 4 } } */ --- gcc/testsuite/gcc.target/i386/avx512bw-pr85090-2.c.jj 2018-03-30 16:19:34.986342321 +0200 +++ gcc/testsuite/gcc.target/i386/avx512bw-pr85090-2.c 2018-03-30 16:19:07.650330188 +0200 @@ -0,0 +1,35 @@ +/* PR middle-end/85090 */ +/* { dg-do compile } */ +/* { dg-options "-O2 -mavx512bw -mtune=intel -masm=att" } */ + +typedef short V __attribute__((vector_size (64))); + +V +f1 (V x, int y) +{ + x[0] = y; + return x; +} + +V +f2 (V x, int y) +{ + x[7] = y; + return x; +} + +V +f3 (V x, int y) +{ + x[11] = y; + return x; +} + +V +f4 (V x, int y) +{ + x[29] = y; + return x; +} + +/* { dg-final { scan-assembler-times "vpbroadcastw\t" 4 } } */ --- gcc/testsuite/gcc.target/i386/avx512bw-pr85090-3.c.jj 2018-03-30 16:20:00.359353581 +0200 +++ gcc/testsuite/gcc.target/i386/avx512bw-pr85090-3.c 2018-03-30 16:20:44.638373224 +0200 @@ -0,0 +1,35 @@ +/* PR middle-end/85090 */ +/* { dg-do compile } */ +/* { dg-options "-O2 -mavx512bw -mtune=intel -masm=att" } */ + +typedef signed char V __attribute__((vector_size (64))); + +V +f1 (V x, int y) +{ + x[0] = y; + return x; +} + +V +f2 (V x, int y) +{ + x[15] = y; + return x; +} + +V +f3 (V x, int y) +{ + x[22] = y; + return x; +} + +V +f4 (V x, int y) +{ + x[59] = y; + return x; +} + +/* { dg-final { scan-assembler-times "vpbroadcastb\t" 4 } } */ Jakub