OK.
Thanks for the patch!

> The BPF backend expansion of setmem was broken, because it could elect
> to use stores of HI, SI or DI modes based on the destitnation alignement
> when the value was QI, but fail to duplicate the byte value across to
> those larger sizes.  This resulted in not all bytes bytes of the
> destination actually being set to the desired value.
>
> Fix bpf_expand_setmem to ensure the desired byte value is really
> duplicated as necessary, whether it is constant or a (sub)reg:QI.
>
> Tested for bpf-unknown-none target on x86_64-linux-gnu host.
>
> Sanity checked by compiling kernel bpf-next selftests, and
> spot-checking the resulting asm for affected __builtin_memsets
> used there.
>
>       PR target/122139
>
> gcc/
>
>       * config/bpf/bpf.cc (bpf_expand_setmem): Duplicate byte value
>       across to new mode when using larger modes for store.
>
> gcc/testsuite/
>
>       * gcc.target/bpf/memset-3.c: New.
>       * gcc.target/bpf/memset-4.c: New.
> ---
>  gcc/config/bpf/bpf.cc                   | 65 +++++++++++++++++++++++--
>  gcc/testsuite/gcc.target/bpf/memset-3.c | 56 +++++++++++++++++++++
>  gcc/testsuite/gcc.target/bpf/memset-4.c | 24 +++++++++
>  3 files changed, 141 insertions(+), 4 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/bpf/memset-3.c
>  create mode 100644 gcc/testsuite/gcc.target/bpf/memset-4.c
>
> diff --git a/gcc/config/bpf/bpf.cc b/gcc/config/bpf/bpf.cc
> index 2e7474be1ff..a28018b3367 100644
> --- a/gcc/config/bpf/bpf.cc
> +++ b/gcc/config/bpf/bpf.cc
> @@ -1427,25 +1427,82 @@ bpf_expand_setmem (rtx *operands)
>    unsigned inc = GET_MODE_SIZE (mode);
>    unsigned offset = 0;
>  
> +  /* If val is a constant, then build a new constant value duplicating
> +     the byte across to the size of stores we might do.
> +       e.g. if val is 0xab and we can store in 4-byte chunks, build
> +       0xabababab and use that to do the memset.
> +     If val is not a constant, then by constraint it is a QImode register
> +     and we similarly duplicate the byte across.  */
> +  rtx src;
> +  if (CONST_INT_P (val))
> +    {
> +      unsigned HOST_WIDE_INT tmp = UINTVAL (val) & 0xff;
> +      /* Need src in the proper mode.  */
> +      switch (mode)
> +     {
> +     case DImode:
> +       src = gen_rtx_CONST_INT (DImode, tmp * 0x0101010101010101);
> +       break;
> +     case SImode:
> +       src = gen_rtx_CONST_INT (SImode, tmp * 0x01010101);
> +       break;
> +     case HImode:
> +       src = gen_rtx_CONST_INT (HImode, tmp * 0x0101);
> +       break;
> +     default:
> +       src = val;
> +       break;
> +     }
> +    }
> +  else
> +    {
> +      /* VAL is a subreg:QI (reg:DI N).
> +      Copy that byte to fill the whole register.  */
> +      src = gen_reg_rtx (mode);
> +      emit_move_insn (src, gen_rtx_ZERO_EXTEND (mode, val));
> +
> +      /* We can fill the whole register with copies of the byte by 
> multiplying
> +      by 0x010101...
> +      For DImode this requires a tmp reg with lldw, but only if we will
> +      actually do nonzero iterations of stxdw.  */
> +      if (mode < DImode || iters == 0)
> +     emit_move_insn (src, gen_rtx_MULT (mode, src, GEN_INT (0x01010101)));
> +      else
> +     {
> +       rtx tmp = gen_reg_rtx (mode);
> +       emit_move_insn (tmp, GEN_INT (0x0101010101010101));
> +       emit_move_insn (src, gen_rtx_MULT (mode, src, tmp));
> +     }
> +    }
> +
>    for (unsigned int i = 0; i < iters; i++)
>      {
> -      emit_move_insn (adjust_address (dst, mode, offset), val);
> +      emit_move_insn (adjust_address (dst, mode, offset), src);
>        offset += inc;
>      }
>    if (remainder & 4)
>      {
> -      emit_move_insn (adjust_address (dst, SImode, offset), val);
> +      emit_move_insn (adjust_address (dst, SImode, offset),
> +                   REG_P (src)
> +                   ? simplify_gen_subreg (SImode, src, mode, 0)
> +                   : src);
>        offset += 4;
>        remainder -= 4;
>      }
>    if (remainder & 2)
>      {
> -      emit_move_insn (adjust_address (dst, HImode, offset), val);
> +      emit_move_insn (adjust_address (dst, HImode, offset),
> +                   REG_P (src)
> +                   ? simplify_gen_subreg (HImode, src, mode, 0)
> +                   : src);
>        offset += 2;
>        remainder -= 2;
>      }
>    if (remainder & 1)
> -    emit_move_insn (adjust_address (dst, QImode, offset), val);
> +    emit_move_insn (adjust_address (dst, QImode, offset),
> +                 REG_P (src)
> +                 ? simplify_gen_subreg (QImode, src, mode, 0)
> +                 : src);
>  
>    return true;
>  }
> diff --git a/gcc/testsuite/gcc.target/bpf/memset-3.c 
> b/gcc/testsuite/gcc.target/bpf/memset-3.c
> new file mode 100644
> index 00000000000..0b044a07a4e
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/bpf/memset-3.c
> @@ -0,0 +1,56 @@
> +/* Test that inline memset expansion properly duplicates the byte value
> +   across the bytes to fill.  PR target/122139.  */
> +/* { dg-do compile } */
> +/* { dg-options "-O1 -masm=normal" } */
> +
> +#define SIZE 63
> +
> +unsigned char  cdata[SIZE];
> +unsigned short sdata[SIZE / 2 + 1];
> +unsigned int   idata[SIZE / 4 + 1];
> +unsigned long  ldata[SIZE / 8 + 1];
> +
> +void
> +a (void)
> +{
> +  __builtin_memset (cdata, 0x54, SIZE);
> +}
> +/* 0x54=84 */
> +/* { dg-final { scan-assembler-times "\[\t \]stb\[^\r\n\]+,84" 63 } } */
> +
> +void
> +b (void)
> +{
> +  __builtin_memset (sdata, 0x7a, SIZE);
> +}
> +
> +/* 0x7a=122, 0x7a7a=31354 */
> +/* { dg-final { scan-assembler-times "\[\t \]sth\[^\r\n\]+,31354" 31 } } */
> +/* { dg-final { scan-assembler-times "\[\t \]stb\[^\r\n\]+,122" 1 } } */
> +
> +void
> +c (void)
> +{
> +  __builtin_memset (idata, 0x23, SIZE);
> +}
> +
> +/* 0x23=35, 0x2323=8995, 0x23232323=589505315 */
> +/* { dg-final { scan-assembler-times "\[\t \]stw\[^\r\n\]+,589505315" 15 } } 
> */
> +/* { dg-final { scan-assembler-times "\[\t \]sth\[^\r\n\]+,8995" 1 } } */
> +/* { dg-final { scan-assembler-times "\[\t \]stb\[^\r\n\]+,35" 1 } } */
> +
> +void
> +d (void)
> +{
> +  __builtin_memset (ldata, 0xcb, SIZE);
> +}
> +
> +/* 0xcbcbcbcb_cbcbcbcb = -3761688987579986997,
> +   0xcbcbcbcb = -875836469
> +   0xcbcb = -13365
> +   0xcb = -53 */
> +/* { dg-final { scan-assembler-times "lddw\t%r.,-3761688987579986997"}} */
> +/* { dg-final { scan-assembler-times "stxdw" 7 } } */
> +/* { dg-final { scan-assembler-times "\[\t \]stw\[^\r\n\]+,-875836469" 1 } } 
> */
> +/* { dg-final { scan-assembler-times "\[\t \]sth\[^\r\n\]+,-13365" 1 } } */
> +/* { dg-final { scan-assembler-times "\[\t \]stb\[^\r\n\]+,-53" 1 } } */
> diff --git a/gcc/testsuite/gcc.target/bpf/memset-4.c 
> b/gcc/testsuite/gcc.target/bpf/memset-4.c
> new file mode 100644
> index 00000000000..0c835c5e4d4
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/bpf/memset-4.c
> @@ -0,0 +1,24 @@
> +/* Test that inline memset expansion properly duplicates the byte value
> +   across the bytes to fill for non-const value.  PR target/122139.  */
> +/* { dg-do compile } */
> +/* { dg-options "-O1 -masm=normal" } */
> +
> +#define SIZE 63
> +
> +unsigned char  cdata[SIZE];
> +unsigned short sdata[SIZE / 2 + 1];
> +unsigned int   idata[SIZE / 4 + 1];
> +unsigned long  ldata[SIZE / 8 + 1];
> +
> +void
> +c (unsigned char byte)
> +{
> +  __builtin_memset (idata, byte, SIZE);
> +}
> +
> +/* Hard to verify for non-const value.  Look for the mul by 0x01010101
> +   and the proper number of stores...  */
> +/* { dg-final { scan-assembler "mul32\[\t \]%r.,16843009" } } */
> +/* { dg-final { scan-assembler-times "stxw" 15 } } */
> +/* { dg-final { scan-assembler-times "stxh" 1 } } */
> +/* { dg-final { scan-assembler-times "stxb" 1 } } */

Reply via email to