On Fri, Dec 31, 2021 at 12:21 PM Noah Goldstein <goldstein....@gmail.com> wrote:
>
> On Fri, Dec 31, 2021 at 12:20 PM H.J. Lu <hjl.to...@gmail.com> wrote:
> >
> > Update MEMSET_VDUP_TO_VEC0_AND_SET_RETURN to use PXOR, which has lower
> > latency and higher throughput than VPBROADCAST, for the zero constant.
> > Since the most common usage of memset is to zero a block of memory, the
> > branch predictor will make the compare/jmp essentially free, so PXOR
> > is effectively executed unconditionally.
>
> Any benchmark results? Is the broadcast on the critical path for any size?

Can you run your workloads to see how many memset calls are zeroing?

> Also, I imagine the vast majority of memset zeros are compile-time known.
>
> I think it might make more sense to give bzero() the fall-through instead and

bzero is an alias of SSE2 memset in glibc.   Should we add __memsetzero
like __memcmpeq?  It should be almost free in glibc.  GCC can use
__memsetzero if it is available.

> add a patch in GCC to prefer bzero > memset.
>
>
> > ---
> >  sysdeps/x86_64/memset.S                            | 14 ++++++++++++--
> >  .../x86_64/multiarch/memset-avx2-unaligned-erms.S  | 14 ++++++++++++--
> >  .../multiarch/memset-avx512-unaligned-erms.S       | 10 ++++++++++
> >  .../x86_64/multiarch/memset-evex-unaligned-erms.S  | 10 ++++++++++
> >  .../x86_64/multiarch/memset-vec-unaligned-erms.S   | 13 +++++++++++++
> >  5 files changed, 57 insertions(+), 4 deletions(-)
> >
> > diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S
> > index 0137eba4cd..513f9c703d 100644
> > --- a/sysdeps/x86_64/memset.S
> > +++ b/sysdeps/x86_64/memset.S
> > @@ -29,15 +29,25 @@
> >  #define VMOVA     movaps
> >
> >  #define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> > -  movd d, %xmm0; \
> >    movq r, %rax; \
> > +  testl d, d; \
> > +  jnz 1f; \
> > +  pxor %xmm0, %xmm0
> > +
> > +# define MEMSET_VDUP_TO_VEC0(d) \
> > +  movd d, %xmm0; \
> >    punpcklbw %xmm0, %xmm0; \
> >    punpcklwd %xmm0, %xmm0; \
> >    pshufd $0, %xmm0, %xmm0
> >
> >  #define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> > -  movd d, %xmm0; \
> >    movq r, %rax; \
> > +  testl d, d; \
> > +  jnz 1f; \
> > +  pxor %xmm0, %xmm0
> > +
> > +# define WMEMSET_VDUP_TO_VEC0(d) \
> > +  movd d, %xmm0; \
> >    pshufd $0, %xmm0, %xmm0
> >
> >  #define SECTION(p)             p
> > diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S 
> > b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
> > index 1af668af0a..8004a27750 100644
> > --- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
> > +++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
> > @@ -11,13 +11,23 @@
> >  # define VMOVA     vmovdqa
> >
> >  # define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> > -  vmovd d, %xmm0; \
> >    movq r, %rax; \
> > +  testl d, d; \
> > +  jnz 1f; \
> > +  vpxor %xmm0, %xmm0, %xmm0
> > +
> > +# define MEMSET_VDUP_TO_VEC0(d) \
> > +  vmovd d, %xmm0; \
> >    vpbroadcastb %xmm0, %ymm0
> >
> >  # define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> > -  vmovd d, %xmm0; \
> >    movq r, %rax; \
> > +  testl d, d; \
> > +  jnz 1f; \
> > +  vpxor %xmm0, %xmm0, %xmm0
> > +
> > +# define WMEMSET_VDUP_TO_VEC0(d) \
> > +  vmovd d, %xmm0; \
> >    vpbroadcastd %xmm0, %ymm0
> >
> >  # ifndef SECTION
> > diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S 
> > b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
> > index f14d6f8493..61ff9ccf6f 100644
> > --- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
> > +++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
> > @@ -17,10 +17,20 @@
> >
> >  # define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> >    movq r, %rax; \
> > +  testl d, d; \
> > +  jnz 1f; \
> > +  vpxorq %XMM0, %XMM0, %XMM0
> > +
> > +# define MEMSET_VDUP_TO_VEC0(d) \
> >    vpbroadcastb d, %VEC0
> >
> >  # define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> >    movq r, %rax; \
> > +  testl d, d; \
> > +  jnz 1f; \
> > +  vpxorq %XMM0, %XMM0, %XMM0
> > +
> > +# define WMEMSET_VDUP_TO_VEC0(d) \
> >    vpbroadcastd d, %VEC0
> >
> >  # define SECTION(p)            p##.evex512
> > diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S 
> > b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
> > index 64b09e77cc..85544fb0fc 100644
> > --- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
> > +++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
> > @@ -17,10 +17,20 @@
> >
> >  # define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> >    movq r, %rax; \
> > +  testl d, d; \
> > +  jnz 1f; \
> > +  vpxorq %XMM0, %XMM0, %XMM0
> > +
> > +# define MEMSET_VDUP_TO_VEC0(d) \
> >    vpbroadcastb d, %VEC0
> >
> >  # define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> >    movq r, %rax; \
> > +  testl d, d; \
> > +  jnz 1f; \
> > +  vpxorq %XMM0, %XMM0, %XMM0
> > +
> > +# define WMEMSET_VDUP_TO_VEC0(d) \
> >    vpbroadcastd d, %VEC0
> >
> >  # define SECTION(p)            p##.evex
> > diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S 
> > b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> > index e723413a66..4ca34a19ba 100644
> > --- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> > +++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> > @@ -112,6 +112,9 @@ ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned))
> >         shl     $2, %RDX_LP
> >         WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
> >         jmp     L(entry_from_bzero)
> > +1:
> > +       WMEMSET_VDUP_TO_VEC0 (%esi)
> > +       jmp     L(entry_from_bzero)
> >  END (WMEMSET_SYMBOL (__wmemset, unaligned))
> >  #endif
> >
> > @@ -124,6 +127,7 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
> >
> >  ENTRY (MEMSET_SYMBOL (__memset, unaligned))
> >         MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
> > +2:
> >  # ifdef __ILP32__
> >         /* Clear the upper 32 bits.  */
> >         mov     %edx, %edx
> > @@ -137,6 +141,10 @@ L(entry_from_bzero):
> >         VMOVU   %VEC(0), -VEC_SIZE(%rdi,%rdx)
> >         VMOVU   %VEC(0), (%rdi)
> >         VZEROUPPER_RETURN
> > +
> > +1:
> > +       MEMSET_VDUP_TO_VEC0 (%esi)
> > +       jmp     2b
> >  #if defined USE_MULTIARCH && IS_IN (libc)
> >  END (MEMSET_SYMBOL (__memset, unaligned))
> >
> > @@ -180,6 +188,7 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, 
> > unaligned_erms))
> >
> >  ENTRY_P2ALIGN (MEMSET_SYMBOL (__memset, unaligned_erms), 6)
> >         MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
> > +2:
> >  # ifdef __ILP32__
> >         /* Clear the upper 32 bits.  */
> >         mov     %edx, %edx
> > @@ -193,6 +202,10 @@ ENTRY_P2ALIGN (MEMSET_SYMBOL (__memset, 
> > unaligned_erms), 6)
> >         VMOVU   %VEC(0), (%rax)
> >         VMOVU   %VEC(0), -VEC_SIZE(%rax, %rdx)
> >         VZEROUPPER_RETURN
> > +
> > +1:
> > +       MEMSET_VDUP_TO_VEC0 (%esi)
> > +       jmp     2b
> >  #endif
> >
> >         .p2align 4,, 10
> > --
> > 2.33.1
> >



-- 
H.J.

Reply via email to