I didn't receive
https://gcc.gnu.org/pipermail/gcc-patches/2021-June/572436.html in my
Gmail account. Does anyone know why?


>--- a/gcc/config/i386/i386-protos.h
>+++ b/gcc/config/i386/i386-protos.h
>@@ -260,6 +260,7 @@ extern void ix86_expand_mul_widen_hilo (rtx, rtx, rtx, bool, bool);
> extern void ix86_expand_sse2_mulv4si3 (rtx, rtx, rtx);
> extern void ix86_expand_sse2_mulvxdi3 (rtx, rtx, rtx);
> extern void ix86_expand_sse2_abs (rtx, rtx);
>+extern bool ix86_expand_integer_vec_duplicate (rtx *);
>
> /* In i386-c.c  */
> extern void ix86_target_macros (void);
>diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
>index 2a34756be2a..f094e5a2586 100644
>--- a/gcc/config/i386/sse.md
>+++ b/gcc/config/i386/sse.md
>@@ -24570,3 +24570,24 @@
>   "TARGET_WIDEKL"
>   "aes<aeswideklvariant>\t{%0}"
>   [(set_attr "type" "other")])
>+
>+;; Modes handled by broadcast patterns.
>+(define_mode_iterator INT_BROADCAST_MODE
>+  [(V64QI "TARGET_AVX512F") (V32QI "TARGET_AVX") V16QI
>+   (V32HI "TARGET_AVX512F") (V16HI "TARGET_AVX") V8HI
>+   (V16SI "TARGET_AVX512F") (V8SI "TARGET_AVX") V4SI
>+   (V8DI "TARGET_AVX512F && TARGET_64BIT")
>+   (V4DI "TARGET_AVX && TARGET_64BIT") (V2DI "TARGET_64BIT")])
TARGET_AVX512BW is needed for V64QImode and V32HImode.
Also, considering the scenario, shouldn't TARGET_AVX2 be required for
all the other 128/256-bit modes?
>+
>+;; Broadcast from an integer.  NB: Enable broadcast only if we can move
>+;; from GPR to SSE register directly.
>+(define_expand "vec_duplicate<mode>"
>+  [(set (match_operand:INT_BROADCAST_MODE 0 "register_operand")
>+ (vec_duplicate:INT_BROADCAST_MODE
>+   (match_operand:<ssescalarmode> 1 "general_operand")))]
>+  "TARGET_SSE2 && TARGET_INTER_UNIT_MOVES_TO_VEC"
Why not use TARGET_AVX2 directly here, rather than in
ix86_expand_integer_vec_duplicate?
Also, define a predicate that restricts operand[1] to a const_int
other than const0_rtx and constm1_rtx.
With this, I guess there's no need for FAIL, and you can call
ix86_expand_vector_init_duplicate directly.
>+{
>+  if (!ix86_expand_integer_vec_duplicate (operands))
>+    FAIL;
>+  DONE;
>

On Fri, Jun 25, 2021 at 1:30 AM H.J. Lu via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> On Wed, Jun 9, 2021 at 4:39 PM H.J. Lu <hjl.to...@gmail.com> wrote:
> >
> > 1. Update move expanders to convert the CONST_WIDE_INT and CONST_VECTOR
> > operands to vector broadcast from an integer with AVX2.
> > 2. Add ix86_gen_scratch_sse_rtx to return a scratch SSE register which
> > won't increase stack alignment requirement and blocks transformation by
> > the combine pass.
> >
> > A small benchmark:
> >
> > https://gitlab.com/x86-benchmarks/microbenchmark/-/tree/memset/broadcast
> >
> > shows that broadcast is a little bit faster on Intel Core i7-8559U:
> >
> > $ make
> > gcc -g -I. -O2   -c -o test.o test.c
> > gcc -g   -c -o memory.o memory.S
> > gcc -g   -c -o broadcast.o broadcast.S
> > gcc -g   -c -o vec_dup_sse2.o vec_dup_sse2.S
> > gcc -o test test.o memory.o broadcast.o vec_dup_sse2.o
> > ./test
> > memory      : 147215
> > broadcast   : 121213
> > vec_dup_sse2: 171366
> > $
> >
> > broadcast is also smaller:
> >
> > $ size memory.o broadcast.o
> >    text    data     bss     dec     hex filename
> >     132       0       0     132      84 memory.o
> >     122       0       0     122      7a broadcast.o
> > $
> >
> > 3. Update PR 87767 tests to expect integer broadcast instead of broadcast
> > from memory.
> > 4. Update avx512f_cond_move.c to expect integer broadcast.
> >
> > A small benchmark:
> >
> > https://gitlab.com/x86-benchmarks/microbenchmark/-/tree/vpaddd/broadcast
> >
> > shows that integer broadcast is faster than embedded memory broadcast:
> >
> > $ make
> > gcc -g -I. -O2 -march=skylake-avx512   -c -o test.o test.c
> > gcc -g   -c -o memory.o memory.S
> > gcc -g   -c -o broadcast.o broadcast.S
> > gcc -o test test.o memory.o broadcast.o
> > ./test
> > memory      : 425538
> > broadcast   : 375260
> > $
> >
> > 5. Update vec_duplicate to allow it to fail so that the backend can allow
> > broadcasting an integer constant to a vector only when a broadcast
> > instruction is available.  This can be used by the memset expander to
> > avoid vec_duplicate when loading from the constant pool is more efficient.
> > 6. Add vec_duplicate<mode> expander and enable vec_duplicate from a
> > non-standard SSE constant integer only if vector broadcast is available.
> >
> > H.J. Lu (2):
> >   x86: Convert CONST_WIDE_INT/CONST_VECTOR to broadcast
> >   x86: Add vec_duplicate<mode> expander
> >
> >  gcc/config/i386/i386-expand.c                 | 208 +++++++++++++++++-
> >  gcc/config/i386/i386-protos.h                 |   3 +
> >  gcc/config/i386/i386.c                        |  13 ++
> >  gcc/config/i386/sse.md                        |  21 ++
> >  gcc/doc/md.texi                               |   2 -
> >  .../i386/avx512f-broadcast-pr87767-1.c        |   7 +-
> >  .../i386/avx512f-broadcast-pr87767-5.c        |   5 +-
> >  .../gcc.target/i386/avx512f_cond_move.c       |   4 +-
> >  .../i386/avx512vl-broadcast-pr87767-1.c       |  12 +-
> >  .../i386/avx512vl-broadcast-pr87767-5.c       |   9 +-
> >  gcc/testsuite/gcc.target/i386/pr100865-1.c    |  13 ++
> >  gcc/testsuite/gcc.target/i386/pr100865-10a.c  |  33 +++
> >  gcc/testsuite/gcc.target/i386/pr100865-10b.c  |   7 +
> >  gcc/testsuite/gcc.target/i386/pr100865-11a.c  |  23 ++
> >  gcc/testsuite/gcc.target/i386/pr100865-11b.c  |   8 +
> >  gcc/testsuite/gcc.target/i386/pr100865-12a.c  |  20 ++
> >  gcc/testsuite/gcc.target/i386/pr100865-12b.c  |   8 +
> >  gcc/testsuite/gcc.target/i386/pr100865-2.c    |  14 ++
> >  gcc/testsuite/gcc.target/i386/pr100865-3.c    |  15 ++
> >  gcc/testsuite/gcc.target/i386/pr100865-4a.c   |  16 ++
> >  gcc/testsuite/gcc.target/i386/pr100865-4b.c   |   9 +
> >  gcc/testsuite/gcc.target/i386/pr100865-5a.c   |  16 ++
> >  gcc/testsuite/gcc.target/i386/pr100865-5b.c   |   9 +
> >  gcc/testsuite/gcc.target/i386/pr100865-6a.c   |  16 ++
> >  gcc/testsuite/gcc.target/i386/pr100865-6b.c   |   9 +
> >  gcc/testsuite/gcc.target/i386/pr100865-7a.c   |  17 ++
> >  gcc/testsuite/gcc.target/i386/pr100865-7b.c   |   9 +
> >  gcc/testsuite/gcc.target/i386/pr100865-8a.c   |  24 ++
> >  gcc/testsuite/gcc.target/i386/pr100865-8b.c   |   7 +
> >  gcc/testsuite/gcc.target/i386/pr100865-9a.c   |  25 +++
> >  gcc/testsuite/gcc.target/i386/pr100865-9b.c   |   7 +
> >  31 files changed, 563 insertions(+), 26 deletions(-)
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr100865-1.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr100865-10a.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr100865-10b.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr100865-11a.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr100865-11b.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr100865-12a.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr100865-12b.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr100865-2.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr100865-3.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr100865-4a.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr100865-4b.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr100865-5a.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr100865-5b.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr100865-6a.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr100865-6b.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr100865-7a.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr100865-7b.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr100865-8a.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr100865-8b.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr100865-9a.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr100865-9b.c
> >
> > --
> > 2.31.1
> >
>
> PING^1.
>
> --
> H.J.



-- 
BR,
Hongtao

Reply via email to