On Fri, Dec 31, 2021 at 12:21 PM Noah Goldstein <goldstein....@gmail.com> wrote:
>
> On Fri, Dec 31, 2021 at 12:20 PM H.J. Lu <hjl.to...@gmail.com> wrote:
> >
> > Update MEMSET_VDUP_TO_VEC0_AND_SET_RETURN to use PXOR, which has lower
> > latency and higher throughput than VPBROADCAST, for zero constant.
> > Since the most common usage of memset is to zero a block of memory, the
> > branch predictor will make the compare/jmp basically free and PXOR is
> > almost like being executed unconditionally.
>
> Any benchmark results? Is the broadcast on the critical path for any size?

Can you run your workloads to see how many memset calls are zeroing?
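One rough way to count them, for a workload without its own counters, is an
LD_PRELOAD interposer along the lines of the sketch below.  This is an
illustration only, not part of the patch: the counter names and the fallback
path are made up, and it only sees out-of-line calls that reach libc, not the
memsets the compiler expands inline.

#define _GNU_SOURCE
#include <dlfcn.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static unsigned long zeroing_calls;
static unsigned long total_calls;

static void
report (void)
{
  fprintf (stderr, "memset: %lu of %lu out-of-line calls were zeroing\n",
           zeroing_calls, total_calls);
}

void *
memset (void *s, int c, size_t n)
{
  static void *(*real_memset) (void *, int, size_t);
  static int initializing;

  if (real_memset == NULL)
    {
      if (initializing)
        {
          /* dlsym may itself end up calling memset; fall back to a
             plain byte loop while we are resolving the real symbol.  */
          unsigned char *p = s;
          for (size_t i = 0; i < n; i++)
            p[i] = (unsigned char) c;
          return s;
        }
      initializing = 1;
      real_memset = dlsym (RTLD_NEXT, "memset");
      atexit (report);
      initializing = 0;
    }

  /* Not thread-safe, but good enough for a rough count.  */
  total_calls++;
  if (c == 0)
    zeroing_calls++;
  return real_memset (s, c, n);
}

Build it with something like
  gcc -shared -fPIC -O2 count-memset.c -o count-memset.so
(plus -ldl on glibc older than 2.34) and run the workload with
LD_PRELOAD=./count-memset.so; the ratio is printed at exit.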
> Also imagine the vast majority of memset zero are compile time known.
>
> I think it might make more sense to give bzero() the fall-through instead and

bzero is an alias of SSE2 memset in glibc.  Should we add __memsetzero
like __memcmpeq?  It should be almost free in glibc.  GCC can use
__memsetzero if it is available.
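To make the analogy with __memcmpeq concrete, the interface could look
something like the sketch below.  The prototype is only a strawman for
discussion; __memsetzero does not exist anywhere yet.

#include <stddef.h>

/* Strawman only: a reserved-name entry point, analogous to __memcmpeq,
   that GCC could emit for memset (s, 0, n) when it knows the target
   glibc provides it.  The fill value is implicitly zero, so the
   implementation can skip the fill-byte broadcast entirely.  */
extern void *__memsetzero (void *s, size_t n);

/* A compiler that knows about it would lower

     memset (buf, 0, len);

   to

     __memsetzero (buf, len);

   and keep emitting plain memset calls otherwise.  */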
> add a patch in GCC to prefer bzero > memset.
>
> >
> > ---
> >  sysdeps/x86_64/memset.S                            | 14 ++++++++++++--
> >  .../x86_64/multiarch/memset-avx2-unaligned-erms.S  | 14 ++++++++++++--
> >  .../multiarch/memset-avx512-unaligned-erms.S       | 10 ++++++++++
> >  .../x86_64/multiarch/memset-evex-unaligned-erms.S  | 10 ++++++++++
> >  .../x86_64/multiarch/memset-vec-unaligned-erms.S   | 13 +++++++++++++
> >  5 files changed, 57 insertions(+), 4 deletions(-)
> >
> > diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S
> > index 0137eba4cd..513f9c703d 100644
> > --- a/sysdeps/x86_64/memset.S
> > +++ b/sysdeps/x86_64/memset.S
> > @@ -29,15 +29,25 @@
> >  #define VMOVA     movaps
> >
> >  #define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> > -  movd d, %xmm0; \
> >    movq r, %rax; \
> > +  testl d, d; \
> > +  jnz 1f; \
> > +  pxor %xmm0, %xmm0
> > +
> > +# define MEMSET_VDUP_TO_VEC0(d) \
> > +  movd d, %xmm0; \
> >    punpcklbw %xmm0, %xmm0; \
> >    punpcklwd %xmm0, %xmm0; \
> >    pshufd $0, %xmm0, %xmm0
> >
> >  #define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> > -  movd d, %xmm0; \
> >    movq r, %rax; \
> > +  testl d, d; \
> > +  jnz 1f; \
> > +  pxor %xmm0, %xmm0
> > +
> > +# define WMEMSET_VDUP_TO_VEC0(d) \
> > +  movd d, %xmm0; \
> >    pshufd $0, %xmm0, %xmm0
> >
> >  #define SECTION(p) p
> > diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
> > index 1af668af0a..8004a27750 100644
> > --- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
> > +++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
> > @@ -11,13 +11,23 @@
> >  # define VMOVA     vmovdqa
> >
> >  # define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> > -  vmovd d, %xmm0; \
> >    movq r, %rax; \
> > +  testl d, d; \
> > +  jnz 1f; \
> > +  vpxor %xmm0, %xmm0, %xmm0
> > +
> > +# define MEMSET_VDUP_TO_VEC0(d) \
> > +  vmovd d, %xmm0; \
> >    vpbroadcastb %xmm0, %ymm0
> >
> >  # define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> > -  vmovd d, %xmm0; \
> >    movq r, %rax; \
> > +  testl d, d; \
> > +  jnz 1f; \
> > +  vpxor %xmm0, %xmm0, %xmm0
> > +
> > +# define WMEMSET_VDUP_TO_VEC0(d) \
> > +  vmovd d, %xmm0; \
> >    vpbroadcastd %xmm0, %ymm0
> >
> >  # ifndef SECTION
> > diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
> > index f14d6f8493..61ff9ccf6f 100644
> > --- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
> > +++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
> > @@ -17,10 +17,20 @@
> >
> >  # define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> >    movq r, %rax; \
> > +  testl d, d; \
> > +  jnz 1f; \
> > +  vpxorq %XMM0, %XMM0, %XMM0
> > +
> > +# define MEMSET_VDUP_TO_VEC0(d) \
> >    vpbroadcastb d, %VEC0
> >
> >  # define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> >    movq r, %rax; \
> > +  testl d, d; \
> > +  jnz 1f; \
> > +  vpxorq %XMM0, %XMM0, %XMM0
> > +
> > +# define WMEMSET_VDUP_TO_VEC0(d) \
> >    vpbroadcastd d, %VEC0
> >
> >  # define SECTION(p) p##.evex512
> > diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
> > index 64b09e77cc..85544fb0fc 100644
> > --- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
> > +++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
> > @@ -17,10 +17,20 @@
> >
> >  # define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> >    movq r, %rax; \
> > +  testl d, d; \
> > +  jnz 1f; \
> > +  vpxorq %XMM0, %XMM0, %XMM0
> > +
> > +# define MEMSET_VDUP_TO_VEC0(d) \
> >    vpbroadcastb d, %VEC0
> >
> >  # define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> >    movq r, %rax; \
> > +  testl d, d; \
> > +  jnz 1f; \
> > +  vpxorq %XMM0, %XMM0, %XMM0
> > +
> > +# define WMEMSET_VDUP_TO_VEC0(d) \
> >    vpbroadcastd d, %VEC0
> >
> >  # define SECTION(p) p##.evex
> > diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> > index e723413a66..4ca34a19ba 100644
> > --- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> > +++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> > @@ -112,6 +112,9 @@ ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned))
> >    shl $2, %RDX_LP
> >    WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
> >    jmp L(entry_from_bzero)
> > +1:
> > +  WMEMSET_VDUP_TO_VEC0 (%esi)
> > +  jmp L(entry_from_bzero)
> >  END (WMEMSET_SYMBOL (__wmemset, unaligned))
> >  #endif
> >
> > @@ -124,6 +127,7 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
> >
> >  ENTRY (MEMSET_SYMBOL (__memset, unaligned))
> >    MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
> > +2:
> >  # ifdef __ILP32__
> >    /* Clear the upper 32 bits.  */
> >    mov %edx, %edx
> > @@ -137,6 +141,10 @@ L(entry_from_bzero):
> >    VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx)
> >    VMOVU %VEC(0), (%rdi)
> >    VZEROUPPER_RETURN
> > +
> > +1:
> > +  MEMSET_VDUP_TO_VEC0 (%esi)
> > +  jmp 2b
> >  #if defined USE_MULTIARCH && IS_IN (libc)
> >  END (MEMSET_SYMBOL (__memset, unaligned))
> >
> > @@ -180,6 +188,7 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
> >
> >  ENTRY_P2ALIGN (MEMSET_SYMBOL (__memset, unaligned_erms), 6)
> >    MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
> > +2:
> >  # ifdef __ILP32__
> >    /* Clear the upper 32 bits.  */
> >    mov %edx, %edx
> > @@ -193,6 +202,10 @@ ENTRY_P2ALIGN (MEMSET_SYMBOL (__memset, unaligned_erms), 6)
> >    VMOVU %VEC(0), (%rax)
> >    VMOVU %VEC(0), -VEC_SIZE(%rax, %rdx)
> >    VZEROUPPER_RETURN
> > +
> > +1:
> > +  MEMSET_VDUP_TO_VEC0 (%esi)
> > +  jmp 2b
> >  #endif
> >
> >    .p2align 4,, 10
> > --
> > 2.33.1
> >

--
H.J.