On Fri, May 15, 2020 at 8:27 AM Uros Bizjak via Gcc-patches <gcc-patches@gcc.gnu.org> wrote: > > On Fri, May 15, 2020 at 1:13 AM H.J. Lu <hjl.to...@gmail.com> wrote: > > > > The -mgeneral-regs-only option generates code that uses only the > > general-purpose registers. It prevents the compiler from using vector > > registers. But GCC may still generate calls to memcpy, memmove, memset > > and memcmp library functions. In the GNU C library, these library > > functions are implemented with vector registers, which makes the > > -mgeneral-regs-only option less effective. The new -mavoid-libcall > > option expands memcpy, memmove and memset into REP MOVSB and REP STOSB > > sequences. This option can be further enhanced with a cmpmem pattern > > to expand memcmp into a REP CMPSB sequence in the future. > > > > Tested on Linux/x86 and Linux/x86-64. OK for master? > > No. Library should provide functions that are appropriate for your > target. There are probably other places in the library that use XMM > registers, so there is no point working around only some specific > functions.
For those specific functions -minline-all-stringops should also work, no? Richard. > Uros. > > > Thanks. > > > > H.J. > > --- > > gcc/ > > > > PR target/95134 > > * config/i386/i386-expand.c (alg_usable_p): Return false for > > libcall with -mavoid-libcall. > > (decide_alg): Avoid libcall and rep_prefix_1_byte instead of > > libcall with -mavoid-libcall. > > * config/i386/i386.opt: Add -mavoid-libcall. > > * doc/invoke.texi: Document -mavoid-libcall. > > > > gcc/testsuite/ > > > > PR target/95134 > > * gcc.target/i386/pr95134-1.c: New test. > > * gcc.target/i386/pr95134-2.c: Likewise. > > * gcc.target/i386/pr95134-3.c: Likewise. > > * gcc.target/i386/pr95134-4.c: Likewise. > > --- > > gcc/config/i386/i386-expand.c | 15 ++++++++++----- > > gcc/config/i386/i386.opt | 6 +++++- > > gcc/doc/invoke.texi | 10 +++++++++- > > gcc/testsuite/gcc.target/i386/pr95134-1.c | 18 ++++++++++++++++++ > > gcc/testsuite/gcc.target/i386/pr95134-2.c | 18 ++++++++++++++++++ > > gcc/testsuite/gcc.target/i386/pr95134-3.c | 18 ++++++++++++++++++ > > gcc/testsuite/gcc.target/i386/pr95134-4.c | 11 +++++++++++ > > 7 files changed, 89 insertions(+), 7 deletions(-) > > create mode 100644 gcc/testsuite/gcc.target/i386/pr95134-1.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pr95134-2.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pr95134-3.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pr95134-4.c > > > > diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c > > index 26531585c5f..b38463bf88c 100644 > > --- a/gcc/config/i386/i386-expand.c > > +++ b/gcc/config/i386/i386-expand.c > > @@ -6816,7 +6816,7 @@ alg_usable_p (enum stringop_alg alg, bool memset, > > bool have_as) > > || (memset ? fixed_regs[AX_REG] : fixed_regs[SI_REG])) > > return false; > > } > > - return true; > > + return !flag_avoid_libcall || alg != libcall; > > } > > > > /* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation. 
> > */ > > @@ -6889,7 +6889,7 @@ decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT > > expected_size, > > setup. */ > > else if (expected_size != -1 && expected_size < 4) > > return loop_1_byte; > > - else if (expected_size != -1) > > + else if (expected_size != -1 && !flag_avoid_libcall) > > { > > enum stringop_alg alg = libcall; > > bool alg_noalign = false; > > @@ -6934,6 +6934,9 @@ decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT > > expected_size, > > } > > } > > } > > + > > + enum stringop_alg alg; > > + > > /* When asked to inline the call anyway, try to pick meaningful choice. > > We look for maximal size of block that is faster to copy by hand and > > take blocks of at most of that size guessing that average size will > > @@ -6945,7 +6948,6 @@ decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT > > expected_size, > > && (algs->unknown_size == libcall > > || !alg_usable_p (algs->unknown_size, memset, have_as))) > > { > > - enum stringop_alg alg; > > HOST_WIDE_INT new_expected_size = (max > 0 ? max : 4096) / 2; > > > > /* If there aren't any usable algorithms or if recursing already, > > @@ -6967,8 +6969,11 @@ decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT > > expected_size, > > gcc_assert (alg != libcall); > > return alg; > > } > > - return (alg_usable_p (algs->unknown_size, memset, have_as) > > - ? algs->unknown_size : libcall); > > + alg = (alg_usable_p (algs->unknown_size, memset, have_as) > > + ? algs->unknown_size : libcall); > > + if (flag_avoid_libcall && alg == libcall) > > + alg = rep_prefix_1_byte; > > + return alg; > > } > > > > /* Decide on alignment. We know that the operand is already aligned to > > ALIGN > > diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt > > index c9f7195d423..23b401bd424 100644 > > --- a/gcc/config/i386/i386.opt > > +++ b/gcc/config/i386/i386.opt > > @@ -1114,4 +1114,8 @@ Support SERIALIZE built-in functions and code > > generation. 
> > > > mtsxldtrk > > Target Report Mask(ISA2_TSXLDTRK) Var(ix86_isa_flags2) Save > > -Support TSXLDTRK built-in functions and code generation. > > \ No newline at end of file > > +Support TSXLDTRK built-in functions and code generation. > > + > > +mavoid-libcall > > +Target Report Var(flag_avoid_libcall) Init(0) > > +Avoid generation of libcall. > > diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi > > index 850aeac033d..0d2d70419d5 100644 > > --- a/gcc/doc/invoke.texi > > +++ b/gcc/doc/invoke.texi > > @@ -1364,7 +1364,7 @@ See RS/6000 and PowerPC Options. > > -mstack-protector-guard-reg=@var{reg} @gol > > -mstack-protector-guard-offset=@var{offset} @gol > > -mstack-protector-guard-symbol=@var{symbol} @gol > > --mgeneral-regs-only -mcall-ms2sysv-xlogues @gol > > +-mgeneral-regs-only -mavoid-libcall -mcall-ms2sysv-xlogues @gol > > -mindirect-branch=@var{choice} -mfunction-return=@var{choice} @gol > > -mindirect-branch-register} > > > > @@ -30115,6 +30115,14 @@ Generate code that uses only the general-purpose > > registers. This > > prevents the compiler from using floating-point, vector, mask and bound > > registers. > > > > +@item -mavoid-libcall > > +@opindex mavoid-libcall > > +Avoid generation of calls to @code{memcpy}, @code{memmove} and > > +@code{memset} library functions. It can be used together with the > > +option @option{-mgeneral-regs-only} to avoid implicit vector register > > +usage in @code{memcpy}, @code{memmove} and @code{memset} library > > +functions. > > + > > @item -mindirect-branch=@var{choice} > > @opindex mindirect-branch > > Convert indirect call and jump with @var{choice}. 
The default is > > diff --git a/gcc/testsuite/gcc.target/i386/pr95134-1.c > > b/gcc/testsuite/gcc.target/i386/pr95134-1.c > > new file mode 100644 > > index 00000000000..8ffa680559d > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/i386/pr95134-1.c > > @@ -0,0 +1,18 @@ > > +/* { dg-do compile } */ > > +/* { dg-options "-O2 -mgeneral-regs-only -mavoid-libcall -mtune=skylake" } > > */ > > + > > +struct foo > > +{ > > + char array[513]; > > +}; > > + > > +extern struct foo x; > > + > > +int > > +func (void) > > +{ > > + __builtin_memset (&x, 0, sizeof (x)); > > + return 0; > > +} > > + > > +/* { dg-final { scan-assembler-not "call\[\\t \]*_?memset" } } */ > > diff --git a/gcc/testsuite/gcc.target/i386/pr95134-2.c > > b/gcc/testsuite/gcc.target/i386/pr95134-2.c > > new file mode 100644 > > index 00000000000..7c6c42a736d > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/i386/pr95134-2.c > > @@ -0,0 +1,18 @@ > > +/* { dg-do compile { target ia32 } } */ > > +/* { dg-options "-O2 -mgeneral-regs-only -mavoid-libcall -mtune=pentium" } > > */ > > + > > +struct foo > > +{ > > + char array[257]; > > +}; > > + > > +extern struct foo x; > > + > > +int > > +func (struct foo i) > > +{ > > + x = i; > > + return 0; > > +} > > + > > +/* { dg-final { scan-assembler-not "call\[\\t \]*_?memcpy" } } */ > > diff --git a/gcc/testsuite/gcc.target/i386/pr95134-3.c > > b/gcc/testsuite/gcc.target/i386/pr95134-3.c > > new file mode 100644 > > index 00000000000..4e4428cd0ae > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/i386/pr95134-3.c > > @@ -0,0 +1,18 @@ > > +/* { dg-do compile { target ia32 } } */ > > +/* { dg-options "-O2 -mgeneral-regs-only -mavoid-libcall -mtune=pentium" } > > */ > > + > > +struct foo > > +{ > > + char array[257]; > > +}; > > + > > +extern struct foo x; > > + > > +int > > +func (struct foo i) > > +{ > > + __builtin_memcpy (&x, &i, sizeof (x)); > > + return 0; > > +} > > + > > +/* { dg-final { scan-assembler-not "call\[\\t \]*_?memcpy" } } */ > > diff --git 
a/gcc/testsuite/gcc.target/i386/pr95134-4.c > > b/gcc/testsuite/gcc.target/i386/pr95134-4.c > > new file mode 100644 > > index 00000000000..d1bd8fbf4c1 > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/i386/pr95134-4.c > > @@ -0,0 +1,11 @@ > > +/* { dg-do compile } */ > > +/* { dg-options "-O2 -mgeneral-regs-only -mavoid-libcall" } */ > > + > > +int > > +func (void *d, void *s, unsigned int l) > > +{ > > + __builtin_memcpy (d, s, l); > > + return 0; > > +} > > + > > +/* { dg-final { scan-assembler-not "call\[\\t \]*_?memcpy" } } */ > > -- > > 2.26.2 > >