On Wed, Aug 4, 2021 at 11:46 AM Uros Bizjak <ubiz...@gmail.com> wrote: > > On Wed, Aug 4, 2021 at 3:34 PM H.J. Lu <hjl.to...@gmail.com> wrote: > > > > On Tue, Aug 3, 2021 at 6:56 AM H.J. Lu <hjl.to...@gmail.com> wrote: > > > > > > 1. Update x86 STORE_MAX_PIECES to use OImode and XImode only if inter-unit > > > move is enabled since x86 uses vec_duplicate, which is enabled only when > > > inter-unit move is enabled, to implement store_by_pieces. > > > 2. Update op_by_pieces_d::op_by_pieces_d to set m_max_size to > > > STORE_MAX_PIECES for store_by_pieces and to COMPARE_MAX_PIECES for > > > compare_by_pieces. > > > > > > gcc/ > > > > > > PR target/101742 > > > * expr.c (op_by_pieces_d::op_by_pieces_d): Set m_max_size to > > > STORE_MAX_PIECES for store_by_pieces and to COMPARE_MAX_PIECES > > > for compare_by_pieces. > > > * config/i386/i386.h (STORE_MAX_PIECES): Use OImode and XImode > > > only if TARGET_INTER_UNIT_MOVES_TO_VEC is true. > > > > > > gcc/testsuite/ > > > > > > PR target/101742 > > > * gcc.target/i386/pr101742a.c: New test. > > > * gcc.target/i386/pr101742b.c: Likewise. > > > --- > > > gcc/config/i386/i386.h | 20 +++++++++++--------- > > > gcc/expr.c | 6 +++++- > > > gcc/testsuite/gcc.target/i386/pr101742a.c | 16 ++++++++++++++++ > > > gcc/testsuite/gcc.target/i386/pr101742b.c | 4 ++++ > > > 4 files changed, 36 insertions(+), 10 deletions(-) > > > create mode 100644 gcc/testsuite/gcc.target/i386/pr101742a.c > > > create mode 100644 gcc/testsuite/gcc.target/i386/pr101742b.c > > > > > > diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h > > > index bed9cd9da18..9b416abd5f4 100644 > > > --- a/gcc/config/i386/i386.h > > > +++ b/gcc/config/i386/i386.h > > > @@ -1783,15 +1783,17 @@ typedef struct ix86_args { > > > /* STORE_MAX_PIECES is the number of bytes at a time that we can > > > store efficiently. */ > > > #define STORE_MAX_PIECES \ > > > - ((TARGET_AVX512F && !TARGET_PREFER_AVX256) \ > > > - ? 
64 \ > > > - : ((TARGET_AVX \ > > > - && !TARGET_PREFER_AVX128 \ > > > - && !TARGET_AVX256_SPLIT_UNALIGNED_STORE) \ > > > - ? 32 \ > > > - : ((TARGET_SSE2 \ > > > - && TARGET_SSE_UNALIGNED_STORE_OPTIMAL) \ > > > - ? 16 : UNITS_PER_WORD))) > > > + (TARGET_INTER_UNIT_MOVES_TO_VEC \ > > > + ? ((TARGET_AVX512F && !TARGET_PREFER_AVX256) \ > > > + ? 64 \ > > > + : ((TARGET_AVX \ > > > + && !TARGET_PREFER_AVX128 \ > > > + && !TARGET_AVX256_SPLIT_UNALIGNED_STORE) \ > > > + ? 32 \ > > > + : ((TARGET_SSE2 \ > > > + && TARGET_SSE_UNALIGNED_STORE_OPTIMAL) \ > > > + ? 16 : UNITS_PER_WORD))) \ > > > + : UNITS_PER_WORD) > > > > > > /* If a memory-to-memory move would take MOVE_RATIO or more simple > > > move-instruction pairs, we will do a cpymem or libcall instead. > > > > expr.c has been fixed. Here is the v2 patch for x86 backend. > > OK for master? > > OK, but please add the comment about vec_duplicate before the define > to explain the situation with TARGET_INTER_UNIT_MOVES_TO_VEC.
This is what I am checking in, with the following comment added before the define:

/* STORE_MAX_PIECES is the number of bytes at a time that we can store
   efficiently. Allow 16/32/64 bytes only if inter-unit move is enabled
   since vec_duplicate enabled by inter-unit move is used to implement
   store_by_pieces of 16/32/64 bytes. */

> Thanks,
> Uros.

Thanks.

-- 
H.J.
From 9487c165afb5b6083a3fc09a2e8b7bcabfe28765 Mon Sep 17 00:00:00 2001 From: "H.J. Lu" <hjl.to...@gmail.com> Date: Tue, 3 Aug 2021 06:17:22 -0700 Subject: [PATCH v3] x86: Update STORE_MAX_PIECES Update STORE_MAX_PIECES to allow 16/32/64 bytes only if inter-unit move is enabled since vec_duplicate enabled by inter-unit move is used to implement store_by_pieces of 16/32/64 bytes. gcc/ PR target/101742 * config/i386/i386.h (STORE_MAX_PIECES): Allow 16/32/64 bytes only if TARGET_INTER_UNIT_MOVES_TO_VEC is true. gcc/testsuite/ PR target/101742 * gcc.target/i386/pr101742a.c: New test. * gcc.target/i386/pr101742b.c: Likewise. --- gcc/config/i386/i386.h | 26 +++++++++++++---------- gcc/testsuite/gcc.target/i386/pr101742a.c | 16 ++++++++++++++ gcc/testsuite/gcc.target/i386/pr101742b.c | 4 ++++ 3 files changed, 35 insertions(+), 11 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/pr101742a.c create mode 100644 gcc/testsuite/gcc.target/i386/pr101742b.c diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h index bed9cd9da18..21fe51bba40 100644 --- a/gcc/config/i386/i386.h +++ b/gcc/config/i386/i386.h @@ -1780,18 +1780,22 @@ typedef struct ix86_args { && TARGET_SSE_UNALIGNED_STORE_OPTIMAL) \ ? 16 : UNITS_PER_WORD))) -/* STORE_MAX_PIECES is the number of bytes at a time that we can - store efficiently. */ +/* STORE_MAX_PIECES is the number of bytes at a time that we can store + efficiently. Allow 16/32/64 bytes only if inter-unit move is enabled + since vec_duplicate enabled by inter-unit move is used to implement + store_by_pieces of 16/32/64 bytes. */ #define STORE_MAX_PIECES \ - ((TARGET_AVX512F && !TARGET_PREFER_AVX256) \ - ? 64 \ - : ((TARGET_AVX \ - && !TARGET_PREFER_AVX128 \ - && !TARGET_AVX256_SPLIT_UNALIGNED_STORE) \ - ? 32 \ - : ((TARGET_SSE2 \ - && TARGET_SSE_UNALIGNED_STORE_OPTIMAL) \ - ? 16 : UNITS_PER_WORD))) + (TARGET_INTER_UNIT_MOVES_TO_VEC \ + ? ((TARGET_AVX512F && !TARGET_PREFER_AVX256) \ + ? 
64 \ + : ((TARGET_AVX \ + && !TARGET_PREFER_AVX128 \ + && !TARGET_AVX256_SPLIT_UNALIGNED_STORE) \ + ? 32 \ + : ((TARGET_SSE2 \ + && TARGET_SSE_UNALIGNED_STORE_OPTIMAL) \ + ? 16 : UNITS_PER_WORD))) \ + : UNITS_PER_WORD) /* If a memory-to-memory move would take MOVE_RATIO or more simple move-instruction pairs, we will do a cpymem or libcall instead. diff --git a/gcc/testsuite/gcc.target/i386/pr101742a.c b/gcc/testsuite/gcc.target/i386/pr101742a.c new file mode 100644 index 00000000000..67ea40587dd --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr101742a.c @@ -0,0 +1,16 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-O3 -mtune=nano-x2" } */ + +int n2; + +__attribute__ ((simd)) char +w7 (void) +{ + short int xb = n2; + int qp; + + for (qp = 0; qp < 2; ++qp) + xb = xb < 1; + + return xb; +} diff --git a/gcc/testsuite/gcc.target/i386/pr101742b.c b/gcc/testsuite/gcc.target/i386/pr101742b.c new file mode 100644 index 00000000000..ba19064077b --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr101742b.c @@ -0,0 +1,4 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-O3 -mtune=nano-x2 -mtune-ctrl=sse_unaligned_store_optimal" } */ + +#include "pr101742a.c" -- 2.31.1