On Fri, Nov 15, 2024 at 03:46, Bertrand Drouvot <bertranddrouvot...@gmail.com> wrote:

> Hi,
>
> On Fri, Nov 15, 2024 at 09:30:25AM +0900, Michael Paquier wrote:
> > On Thu, Nov 14, 2024 at 12:33:20PM +0000, Bertrand Drouvot wrote:
> > Anyway, as you say, the
> > portability of v12 is OK even for sizeof(size_t) == 4 because we don't
> > rely on any hardcoded values, and this patch does what it should in
> > this case (double-checked myself manually for the three cases with
> > -m32).
>
> Yeah, thanks for the testing!
>
> > > What would be unsafe on 32-bit would be to read up to 32 bytes while len < 32 and that can not happen.
> > >
> > > As mentioned up-thread the comments are wrong on 32-bit, indeed they must be read as:
> > >
> > > Case 1: len < 4 bytes
> > > Case 2: len in the 4-31 bytes range
> > > Case 3: len >= 32 bytes
> >
> > This part could be indeed better than what's proposed in v12, so I
> > would recommend to use sizeof(size_t) a bit more consistently rather
> > than have the reader guess that.
>
> Makes sense even if that looks "more difficult" to read.
>
> > Some comments feel duplicated, as well, like the "no risk" mentions,
> > which are clear enough based on the description and the limitations of
> > the previous cases.  I'd like to suggest a few tweaks, making the
> > comments more flexible.  See 0003 that applies on top of your latest
> > patch set, reattaching v12 again.
>
> Thanks! Applied on v13 attached, except for things like:
>
> "
> -       /* Compare bytes until the pointer "p" is aligned */
> +       /* Compare bytes until the pointer "p" is aligned. */
> "
>
> which is adding a "." at the end of single line comments (as the few already part of this file don't do so).
>
There is a tiny typo in v13:
+ /* "len" in the [sizeof(size_t) * 8, inf] range */

But, I'm not sure if I'm still doing something wrong.
If so, forgive me for the noise.

With the attached v3_allzeros_check.c, the result is:
cc -march=native -O2 v3_allzeros_check.c -o v3_allzeros_check ; ./v3_allzeros_check
pagebytes[BLCKSZ-2]=1
byte per byte: is_allzeros
size_t: is_allzeros
SIMD v10: is_allzeros
SIMD v11: is_allzeros
SIMD v12: is_allzeros
SIMD v14: is_allzeros

Of course I expected "not is_allzeros".

Anyway, I made another attempt to optimize a bit more; I don't know if it's safe, though.
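The idea in v14 is to call an unrolled helper (pg_memory_is_all_zeros_simd) directly when the pointer is already size_t-aligned and len >= sizeof(size_t) * 8, and to fall back to the previous logic otherwise; see pg_memory_is_all_zeros_v14 in the attachment.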
Results with the attached v3_allzeros_small.c (BLCKSZ = 8192, Ubuntu 22.04, 64-bit):

gcc -march=native -O2 v3_allzeros_small.c -o v3_allzeros_small ; ./v3_allzeros_small
byte per byte: done in 5027744 nanoseconds
size_t: done in 382521 nanoseconds (13.1437 times faster than byte per byte)
SIMD v10: done in 157777 nanoseconds (31.8661 times faster than byte per byte)
SIMD v11: done in 159696 nanoseconds (31.4832 times faster than byte per byte)
SIMD v12: done in 168117 nanoseconds (29.9062 times faster than byte per byte)
SIMD v14: done in 21008 nanoseconds (239.325 times faster than byte per byte)

best regards,
Ranier Vilela
#include <stdbool.h>
#include <stddef.h>
#include <string.h>
#include <stdio.h>
#include <stdint.h>
#include <time.h>
#include <immintrin.h>

#define BLCKSZ 8192
#define LOOPS 1000

static inline bool
allzeros_byte_per_byte(const void *ptr, size_t len)
{
	const unsigned char *p = (const unsigned char *) ptr;
	const unsigned char *end = &p[len];

	while (p < end)
	{
		if (*p++ != 0)
			return false;
	}
	return true;
}

static inline bool
allzeros_size_t(const void *ptr, size_t len)
{
    const unsigned char *p = (const unsigned char *) ptr;
    const unsigned char *end = &p[len];
    const unsigned char *aligned_end = (const unsigned char *)
        ((uintptr_t) end & (~(sizeof(size_t) - 1)));

    /* Compare bytes until the pointer "p" is aligned */
    while (((uintptr_t) p & (sizeof(size_t) - 1)) != 0)
    {
        if (p == end)
            return true;

        if (*p++ != 0)
            return false;
    }

    /*
     * Compare remaining size_t-aligned chunks.
     *
     * aligned_end can't be > end as we ensured to take care of len < 8 (in
     * the len < 64 check below). So, no risk to read beyond the memory area.
     */
    for (; p < aligned_end; p += sizeof(size_t))
    {
        if (*(size_t *) p != 0)
            return false;
    }

    /* Compare remaining bytes until the end */
    while (p < end)
    {
        if (*p++ != 0)
            return false;
    }

    return true;
}

static inline bool
pg_memory_is_all_zeros_v10(const void *ptr, size_t len)
{
    const unsigned char *p = (const unsigned char *) ptr;
    const unsigned char *end = &p[len];
    const unsigned char *aligned_end = (const unsigned char *)
        ((uintptr_t) end & (~(sizeof(size_t) - 1)));

    /* Compare bytes until the pointer "p" is aligned */
    while (((uintptr_t) p & (sizeof(size_t) - 1)) != 0)
    {
        if (p == end)
            return true;

        if (*p++ != 0)
            return false;
    }

    /*
     * Compare 8 * sizeof(size_t) chunks at once.
     *
     * For performance reasons, we manually unroll this loop and purposefully
     * use bitwise-ORs to combine each comparison.  This prevents boolean
     * short-circuiting and lets the compiler know that it's safe to access
     * all 8 elements regardless of the result of the other comparisons.  This
     * seems to be enough to coax a few compilers into using SIMD
     * instructions.
     *
     * There is no risk to read beyond the memory area thanks to the len < 64
     * check done below.
     */
    for (; p < aligned_end - (sizeof(size_t) * 7); p += sizeof(size_t) * 8)
    {
        if ((((size_t *) p)[0] != 0) | (((size_t *) p)[1] != 0) |
            (((size_t *) p)[2] != 0) | (((size_t *) p)[3] != 0) |
            (((size_t *) p)[4] != 0) | (((size_t *) p)[5] != 0) |
            (((size_t *) p)[6] != 0) | (((size_t *) p)[7] != 0))
            return false;
    }

    /*
     * Compare remaining size_t-aligned chunks.
     *
     * aligned_end can't be > end as we ensured to take care of len < 8 (in
     * the len < 64 check below). So, no risk to read beyond the memory area.
     */
    for (; p < aligned_end; p += sizeof(size_t))
    {
        if (*(size_t *) p != 0)
            return false;
    }

    /* Compare remaining bytes until the end */
    while (p < end)
    {
        if (*p++ != 0)
            return false;
    }

    return true;
}


static inline bool
pg_memory_is_all_zeros_v11(const void *ptr, size_t len)
{
    const unsigned char *p = (const unsigned char *) ptr;
    const unsigned char *end = &p[len];
    const unsigned char *aligned_end = (const unsigned char *)
        ((uintptr_t) end & (~(sizeof(size_t) - 1)));

    /*
     * For len < 64, compare byte per byte to ensure we'll not read beyond the
     * memory area.
     */
    if (len < sizeof(size_t) * 8)
    {
        while (p < end)
        {
            if (*p++ != 0)
                return false;
        }
        return true;
    }


    /* Compare bytes until the pointer "p" is aligned */
    while (((uintptr_t) p & (sizeof(size_t) - 1)) != 0)
    {
        if (p == end)
            return true;

        if (*p++ != 0)
            return false;
    }

    /*
     * Compare 8 * sizeof(size_t) chunks at once.
     *
     * For performance reasons, we manually unroll this loop and purposefully
     * use bitwise-ORs to combine each comparison.  This prevents boolean
     * short-circuiting and lets the compiler know that it's safe to access
     * all 8 elements regardless of the result of the other comparisons.  This
     * seems to be enough to coax a few compilers into using SIMD
     * instructions.
     *
     * There is no risk to read beyond the memory area thanks to the len < 64
     * check done below.
     */
    for (; p < aligned_end - (sizeof(size_t) * 7); p += sizeof(size_t) * 8)
    {
        if ((((size_t *) p)[0] != 0) | (((size_t *) p)[1] != 0) |
            (((size_t *) p)[2] != 0) | (((size_t *) p)[3] != 0) |
            (((size_t *) p)[4] != 0) | (((size_t *) p)[5] != 0) |
            (((size_t *) p)[6] != 0) | (((size_t *) p)[7] != 0))
            return false;
    }

    /*
     * Compare remaining size_t-aligned chunks.
     *
     * aligned_end can't be > end as we ensured to take care of len < 8 (in
     * the len < 64 check below). So, no risk to read beyond the memory area.
     */
    for (; p < aligned_end; p += sizeof(size_t))
    {
        if (*(size_t *) p != 0)
            return false;
    }

    /* Compare remaining bytes until the end */
    while (p < end)
    {
        if (*p++ != 0)
            return false;
    }

    return true;
}


static inline bool
pg_memory_is_all_zeros_v12(const void *ptr, size_t len)
{
    const unsigned char *p = (const unsigned char *) ptr;
    const unsigned char *end = &p[len];
    const unsigned char *aligned_end = (const unsigned char *)
        ((uintptr_t) end & (~(sizeof(size_t) - 1)));

    if (len < sizeof(size_t))  // < 8 bytes
    {
        while (p < end)
        {
            if (*p++ != 0)
                return false;
        }
        return true;
    }
    else if (len < sizeof(size_t) * 8)  // 8-63 bytes
    {
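        /*
         * Align "p" byte per byte, then compare size_t-sized chunks, then the
         * remaining tail bytes.
         */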
        while (((uintptr_t) p & (sizeof(size_t) - 1)) != 0)
        {
            if (p == end)
                return true;
            if (*p++ != 0)
                return false;
        }
        
        for (; p < aligned_end; p += sizeof(size_t))
        {
            if (*(size_t *) p != 0)
                return false;
        }
        
        while (p < end)
        {
            if (*p++ != 0)
                return false;
        }
        return true;
    }

    /* Compare bytes until the pointer "p" is aligned */
    while (((uintptr_t) p & (sizeof(size_t) - 1)) != 0)
    {
        if (p == end)
            return true;

        if (*p++ != 0)
            return false;
    }

    /*
     * Compare 8 * sizeof(size_t) chunks at once.
     *
     * For performance reasons, we manually unroll this loop and purposefully
     * use bitwise-ORs to combine each comparison.  This prevents boolean
     * short-circuiting and lets the compiler know that it's safe to access
     * all 8 elements regardless of the result of the other comparisons.  This
     * seems to be enough to coax a few compilers into using SIMD
     * instructions.
     *
     * There is no risk to read beyond the memory area thanks to the len < 64
     * check done below.
     */
    for (; p < aligned_end - (sizeof(size_t) * 7); p += sizeof(size_t) * 8)
    {
        if ((((size_t *) p)[0] != 0) | (((size_t *) p)[1] != 0) |
            (((size_t *) p)[2] != 0) | (((size_t *) p)[3] != 0) |
            (((size_t *) p)[4] != 0) | (((size_t *) p)[5] != 0) |
            (((size_t *) p)[6] != 0) | (((size_t *) p)[7] != 0))
            return false;
    }

    /*
     * Compare remaining size_t-aligned chunks.
     *
     * aligned_end can't be > end as we ensured to take care of len < 8 (in
     * the len < 64 check below). So, no risk to read beyond the memory area.
     */
    for (; p < aligned_end; p += sizeof(size_t))
    {
        if (*(size_t *) p != 0)
            return false;
    }

    /* Compare remaining bytes until the end */
    while (p < end)
    {
        if (*p++ != 0)
            return false;
    }

    return true;
}


static inline bool
pg_memory_is_all_zeros_simd(const size_t *p, const size_t * end)
{
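    /*
     * The caller must pass a size_t-aligned pointer; each iteration ORs
     * together eight size_t comparisons to avoid boolean short-circuiting.
     */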
    for (; p < end; p += sizeof(size_t) * 8)
    {
        if ((((size_t *) p)[0] != 0) | (((size_t *) p)[1] != 0) |
            (((size_t *) p)[2] != 0) | (((size_t *) p)[3] != 0) |
            (((size_t *) p)[4] != 0) | (((size_t *) p)[5] != 0) |
            (((size_t *) p)[6] != 0) | (((size_t *) p)[7] != 0))
            return false;
    }

    return true;
}


static inline bool
pg_memory_is_all_zeros_v14(const void *ptr, size_t len)
{
    const unsigned char *p = (const unsigned char *) ptr;
    const unsigned char *end = &p[len];
    const unsigned char *aligned_end = (const unsigned char *)
        ((uintptr_t) end & (~(sizeof(size_t) - 1)));

    if ((len >= sizeof(size_t) * 8) && (((uintptr_t) p & (sizeof(size_t) - 1)) == 0))
    {
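        /*
         * Already size_t-aligned and at least sizeof(size_t) * 8 bytes:
         * delegate to the unrolled helper.
         */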
        return pg_memory_is_all_zeros_simd((const size_t *) p, (const size_t *) end);
    }

    if (len < sizeof(size_t))
    {
        while (p < end)
        {
            if (*p++ != 0)
                return false;
        }
        return true;
    }
    else if (len < sizeof(size_t) * 8)
    {
        while (((uintptr_t) p & (sizeof(size_t) - 1)) != 0)
        {
            if (p == end)
                return true;
            if (*p++ != 0)
                return false;
        }
        
        for (; p < aligned_end; p += sizeof(size_t))
        {
            if (*(size_t *) p != 0)
                return false;
        }
        
        while (p < end)
        {
            if (*p++ != 0)
                return false;
        }
        return true;
    }

    /* Compare bytes until the pointer "p" is aligned */
    while (((uintptr_t) p & (sizeof(size_t) - 1)) != 0)
    {
        if (p == end)
            return true;

        if (*p++ != 0)
            return false;
    }

    /*
     * Compare 8 * sizeof(size_t) chunks at once.
     *
     * For performance reasons, we manually unroll this loop and purposefully
     * use bitwise-ORs to combine each comparison.  This prevents boolean
     * short-circuiting and lets the compiler know that it's safe to access
     * all 8 elements regardless of the result of the other comparisons.  This
     * seems to be enough to coax a few compilers into using SIMD
     * instructions.
     *
     * There is no risk to read beyond the memory area thanks to the len < 64
     * check done below.
     */
    for (; p < aligned_end - (sizeof(size_t) * 7); p += sizeof(size_t) * 8)
    {
        if ((((size_t *) p)[0] != 0) | (((size_t *) p)[1] != 0) |
            (((size_t *) p)[2] != 0) | (((size_t *) p)[3] != 0) |
            (((size_t *) p)[4] != 0) | (((size_t *) p)[5] != 0) |
            (((size_t *) p)[6] != 0) | (((size_t *) p)[7] != 0))
            return false;
    }

    /*
     * Compare remaining size_t-aligned chunks.
     *
     * aligned_end can't be > end as we ensured to take care of len < 8 (in
     * the len < 64 check below). So, no risk to read beyond the memory area.
     */
    for (; p < aligned_end; p += sizeof(size_t))
    {
        if (*(size_t *) p != 0)
            return false;
    }

    /* Compare remaining bytes until the end */
    while (p < end)
    {
        if (*p++ != 0)
            return false;
    }

    return true;
}



#define NANOSEC_PER_SEC 1000000000

// Returns difference in nanoseconds
int64_t
get_clock_diff(struct timespec *t1, struct timespec *t2)
{
	int64_t nanosec = (t1->tv_sec - t2->tv_sec) * NANOSEC_PER_SEC;
	nanosec += (t1->tv_nsec - t2->tv_nsec);

	return nanosec;
}

int main()
{
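	/* "result" is volatile so the compiler cannot optimize the calls away */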
	size_t pagebytes[BLCKSZ] = {0};
	volatile bool result;
	struct timespec start,end;
	int64_t byte_time, size_t_time;

	clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &start);

	for (int i = 0; i < LOOPS; i++)
	{
		result = allzeros_byte_per_byte(pagebytes, BLCKSZ);
	}

	clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &end);
	byte_time = get_clock_diff(&end, &start);
	printf("byte per byte: done in %ld nanoseconds\n", byte_time);

	clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &start);

	for (int i = 0; i < LOOPS; i++)
	{
		result = allzeros_size_t(pagebytes, BLCKSZ);
	}

	clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &end);
	size_t_time = get_clock_diff(&end, &start);
	printf("size_t: done in %ld nanoseconds (%g times faster than byte per byte)\n", size_t_time, (double) byte_time / size_t_time);

	clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &start);

	for (int i = 0; i < LOOPS; i++)
	{
		result = pg_memory_is_all_zeros_v10(pagebytes, BLCKSZ);
	}

	clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &end);
	size_t_time = get_clock_diff(&end, &start);
	printf("SIMD v10: done in %ld nanoseconds (%g times faster than byte per byte)\n", size_t_time, (double) byte_time / size_t_time);

	clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &start);
	for (int i = 0; i < LOOPS; i++)
	{
		result = pg_memory_is_all_zeros_v11(pagebytes, BLCKSZ);
	}

	clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &end);
	size_t_time = get_clock_diff(&end, &start);
	printf("SIMD v11: done in %ld nanoseconds (%g times faster than byte per byte)\n", size_t_time, (double) byte_time / size_t_time);

	clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &start);
	for (int i = 0; i < LOOPS; i++)
	{
		result = pg_memory_is_all_zeros_v12(pagebytes, BLCKSZ);
	}

	clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &end);
	size_t_time = get_clock_diff(&end, &start);
	printf("SIMD v12: done in %ld nanoseconds (%g times faster than byte per byte)\n", size_t_time, (double) byte_time / size_t_time);

	clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &start);
	for (int i = 0; i < LOOPS; i++)
	{
		result = pg_memory_is_all_zeros_v14(pagebytes, BLCKSZ);
	}

	clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &end);
	size_t_time = get_clock_diff(&end, &start);
	printf("SIMD v14: done in %ld nanoseconds (%g times faster than byte per byte)\n", size_t_time, (double) byte_time / size_t_time);

	return 0;
}

#include <stdbool.h>
#include <stddef.h>
#include <string.h>
#include <stdio.h>
#include <stdint.h>
#include <time.h>
#include <immintrin.h>

#define BLCKSZ 8192
#define LOOPS 1000

static inline bool
allzeros_byte_per_byte(const void *ptr, size_t len)
{
	const unsigned char *p = (const unsigned char *) ptr;
	const unsigned char *end = &p[len];

	while (p < end)
	{
		if (*p++ != 0)
			return false;
	}
	return true;
}

static inline bool
allzeros_size_t(const void *ptr, size_t len)
{
    const unsigned char *p = (const unsigned char *) ptr;
    const unsigned char *end = &p[len];
    const unsigned char *aligned_end = (const unsigned char *)
        ((uintptr_t) end & (~(sizeof(size_t) - 1)));

    /* Compare bytes until the pointer "p" is aligned */
    while (((uintptr_t) p & (sizeof(size_t) - 1)) != 0)
    {
        if (p == end)
            return true;

        if (*p++ != 0)
            return false;
    }

    /*
     * Compare remaining size_t-aligned chunks.
     *
     * aligned_end can't be > end as we ensured to take care of len < 8 (in
     * the len < 64 check below). So, no risk to read beyond the memory area.
     */
    for (; p < aligned_end; p += sizeof(size_t))
    {
        if (*(size_t *) p != 0)
            return false;
    }

    /* Compare remaining bytes until the end */
    while (p < end)
    {
        if (*p++ != 0)
            return false;
    }

    return true;
}

static inline bool
pg_memory_is_all_zeros_v10(const void *ptr, size_t len)
{
    const unsigned char *p = (const unsigned char *) ptr;
    const unsigned char *end = &p[len];
    const unsigned char *aligned_end = (const unsigned char *)
        ((uintptr_t) end & (~(sizeof(size_t) - 1)));

    /* Compare bytes until the pointer "p" is aligned */
    while (((uintptr_t) p & (sizeof(size_t) - 1)) != 0)
    {
        if (p == end)
            return true;

        if (*p++ != 0)
            return false;
    }

    /*
     * Compare 8 * sizeof(size_t) chunks at once.
     *
     * For performance reasons, we manually unroll this loop and purposefully
     * use bitwise-ORs to combine each comparison.  This prevents boolean
     * short-circuiting and lets the compiler know that it's safe to access
     * all 8 elements regardless of the result of the other comparisons.  This
     * seems to be enough to coax a few compilers into using SIMD
     * instructions.
     *
     * There is no risk to read beyond the memory area thanks to the len < 64
     * check done below.
     */
    for (; p < aligned_end - (sizeof(size_t) * 7); p += sizeof(size_t) * 8)
    {
        if ((((size_t *) p)[0] != 0) | (((size_t *) p)[1] != 0) |
            (((size_t *) p)[2] != 0) | (((size_t *) p)[3] != 0) |
            (((size_t *) p)[4] != 0) | (((size_t *) p)[5] != 0) |
            (((size_t *) p)[6] != 0) | (((size_t *) p)[7] != 0))
            return false;
    }

    /*
     * Compare remaining size_t-aligned chunks.
     *
     * aligned_end can't be > end as we ensured to take care of len < 8 (in
     * the len < 64 check below). So, no risk to read beyond the memory area.
     */
    for (; p < aligned_end; p += sizeof(size_t))
    {
        if (*(size_t *) p != 0)
            return false;
    }

    /* Compare remaining bytes until the end */
    while (p < end)
    {
        if (*p++ != 0)
            return false;
    }

    return true;
}


static inline bool
pg_memory_is_all_zeros_v11(const void *ptr, size_t len)
{
    const unsigned char *p = (const unsigned char *) ptr;
    const unsigned char *end = &p[len];
    const unsigned char *aligned_end = (const unsigned char *)
        ((uintptr_t) end & (~(sizeof(size_t) - 1)));

    /*
     * For len < 64, compare byte per byte to ensure we'll not read beyond the
     * memory area.
     */
    if (len < sizeof(size_t) * 8)
    {
        while (p < end)
        {
            if (*p++ != 0)
                return false;
        }
        return true;
    }


    /* Compare bytes until the pointer "p" is aligned */
    while (((uintptr_t) p & (sizeof(size_t) - 1)) != 0)
    {
        if (p == end)
            return true;

        if (*p++ != 0)
            return false;
    }

    /*
     * Compare 8 * sizeof(size_t) chunks at once.
     *
     * For performance reasons, we manually unroll this loop and purposefully
     * use bitwise-ORs to combine each comparison.  This prevents boolean
     * short-circuiting and lets the compiler know that it's safe to access
     * all 8 elements regardless of the result of the other comparisons.  This
     * seems to be enough to coax a few compilers into using SIMD
     * instructions.
     *
     * There is no risk to read beyond the memory area thanks to the len < 64
     * check done below.
     */
    for (; p < aligned_end - (sizeof(size_t) * 7); p += sizeof(size_t) * 8)
    {
        if ((((size_t *) p)[0] != 0) | (((size_t *) p)[1] != 0) |
            (((size_t *) p)[2] != 0) | (((size_t *) p)[3] != 0) |
            (((size_t *) p)[4] != 0) | (((size_t *) p)[5] != 0) |
            (((size_t *) p)[6] != 0) | (((size_t *) p)[7] != 0))
            return false;
    }

    /*
     * Compare remaining size_t-aligned chunks.
     *
     * aligned_end can't be > end as we ensured to take care of len < 8 (in
     * the len < 64 check below). So, no risk to read beyond the memory area.
     */
    for (; p < aligned_end; p += sizeof(size_t))
    {
        if (*(size_t *) p != 0)
            return false;
    }

    /* Compare remaining bytes until the end */
    while (p < end)
    {
        if (*p++ != 0)
            return false;
    }

    return true;
}


static inline bool
pg_memory_is_all_zeros_v12(const void *ptr, size_t len)
{
    const unsigned char *p = (const unsigned char *) ptr;
    const unsigned char *end = &p[len];
    const unsigned char *aligned_end = (const unsigned char *)
        ((uintptr_t) end & (~(sizeof(size_t) - 1)));

    if (len < sizeof(size_t))  // < 8 bytes
    {
        while (p < end)
        {
            if (*p++ != 0)
                return false;
        }
        return true;
    }
    else if (len < sizeof(size_t) * 8)  // 8-63 bytes
    {
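        /*
         * Align "p" byte per byte, then compare size_t-sized chunks, then the
         * remaining tail bytes.
         */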
        while (((uintptr_t) p & (sizeof(size_t) - 1)) != 0)
        {
            if (p == end)
                return true;
            if (*p++ != 0)
                return false;
        }
        
        for (; p < aligned_end; p += sizeof(size_t))
        {
            if (*(size_t *) p != 0)
                return false;
        }
        
        while (p < end)
        {
            if (*p++ != 0)
                return false;
        }
        return true;
    }

    /* Compare bytes until the pointer "p" is aligned */
    while (((uintptr_t) p & (sizeof(size_t) - 1)) != 0)
    {
        if (p == end)
            return true;

        if (*p++ != 0)
            return false;
    }

    /*
     * Compare 8 * sizeof(size_t) chunks at once.
     *
     * For performance reasons, we manually unroll this loop and purposefully
     * use bitwise-ORs to combine each comparison.  This prevents boolean
     * short-circuiting and lets the compiler know that it's safe to access
     * all 8 elements regardless of the result of the other comparisons.  This
     * seems to be enough to coax a few compilers into using SIMD
     * instructions.
     *
     * There is no risk to read beyond the memory area thanks to the len < 64
     * check done below.
     */
    for (; p < aligned_end - (sizeof(size_t) * 7); p += sizeof(size_t) * 8)
    {
        if ((((size_t *) p)[0] != 0) | (((size_t *) p)[1] != 0) |
            (((size_t *) p)[2] != 0) | (((size_t *) p)[3] != 0) |
            (((size_t *) p)[4] != 0) | (((size_t *) p)[5] != 0) |
            (((size_t *) p)[6] != 0) | (((size_t *) p)[7] != 0))
            return false;
    }

    /*
     * Compare remaining size_t-aligned chunks.
     *
     * aligned_end can't be > end as we ensured to take care of len < 8 (in
     * the len < 64 check below). So, no risk to read beyond the memory area.
     */
    for (; p < aligned_end; p += sizeof(size_t))
    {
        if (*(size_t *) p != 0)
            return false;
    }

    /* Compare remaining bytes until the end */
    while (p < end)
    {
        if (*p++ != 0)
            return false;
    }

    return true;
}


static inline bool
pg_memory_is_all_zeros_simd(const size_t *p, const size_t * end)
{
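    /*
     * The caller must pass a size_t-aligned pointer; each iteration ORs
     * together eight size_t comparisons to avoid boolean short-circuiting.
     */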
    for (; p < end; p += sizeof(size_t) * 8)
    {
        if ((((size_t *) p)[0] != 0) | (((size_t *) p)[1] != 0) |
            (((size_t *) p)[2] != 0) | (((size_t *) p)[3] != 0) |
            (((size_t *) p)[4] != 0) | (((size_t *) p)[5] != 0) |
            (((size_t *) p)[6] != 0) | (((size_t *) p)[7] != 0))
            return false;
    }

    return true;
}


static inline bool
pg_memory_is_all_zeros_v14(const void *ptr, size_t len)
{
    const unsigned char *p = (const unsigned char *) ptr;
    const unsigned char *end = &p[len];
    const unsigned char *aligned_end = (const unsigned char *)
        ((uintptr_t) end & (~(sizeof(size_t) - 1)));

    if ((len >= sizeof(size_t) * 8) && (((uintptr_t) p & (sizeof(size_t) - 1)) == 0))
    {
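        /*
         * Already size_t-aligned and at least sizeof(size_t) * 8 bytes:
         * delegate to the unrolled helper.
         */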
        return pg_memory_is_all_zeros_simd((const size_t *) p, (const size_t *) end);
    }

    if (len < sizeof(size_t))
    {
        while (p < end)
        {
            if (*p++ != 0)
                return false;
        }
        return true;
    }
    else if (len < sizeof(size_t) * 8)
    {
        while (((uintptr_t) p & (sizeof(size_t) - 1)) != 0)
        {
            if (p == end)
                return true;
            if (*p++ != 0)
                return false;
        }
        
        for (; p < aligned_end; p += sizeof(size_t))
        {
            if (*(size_t *) p != 0)
                return false;
        }
        
        while (p < end)
        {
            if (*p++ != 0)
                return false;
        }
        return true;
    }

    /* Compare bytes until the pointer "p" is aligned */
    while (((uintptr_t) p & (sizeof(size_t) - 1)) != 0)
    {
        if (p == end)
            return true;

        if (*p++ != 0)
            return false;
    }

    /*
     * Compare 8 * sizeof(size_t) chunks at once.
     *
     * For performance reasons, we manually unroll this loop and purposefully
     * use bitwise-ORs to combine each comparison.  This prevents boolean
     * short-circuiting and lets the compiler know that it's safe to access
     * all 8 elements regardless of the result of the other comparisons.  This
     * seems to be enough to coax a few compilers into using SIMD
     * instructions.
     *
     * There is no risk to read beyond the memory area thanks to the len < 64
     * check done below.
     */
    for (; p < aligned_end - (sizeof(size_t) * 7); p += sizeof(size_t) * 8)
    {
        if ((((size_t *) p)[0] != 0) | (((size_t *) p)[1] != 0) |
            (((size_t *) p)[2] != 0) | (((size_t *) p)[3] != 0) |
            (((size_t *) p)[4] != 0) | (((size_t *) p)[5] != 0) |
            (((size_t *) p)[6] != 0) | (((size_t *) p)[7] != 0))
            return false;
    }

    /*
     * Compare remaining size_t-aligned chunks.
     *
     * aligned_end can't be > end as we ensured to take care of len < 8 (in
     * the len < 64 check below). So, no risk to read beyond the memory area.
     */
    for (; p < aligned_end; p += sizeof(size_t))
    {
        if (*(size_t *) p != 0)
            return false;
    }

    /* Compare remaining bytes until the end */
    while (p < end)
    {
        if (*p++ != 0)
            return false;
    }

    return true;
}



#define NANOSEC_PER_SEC 1000000000

// Returns difference in nanoseconds
int64_t
get_clock_diff(struct timespec *t1, struct timespec *t2)
{
	int64_t nanosec = (t1->tv_sec - t2->tv_sec) * NANOSEC_PER_SEC;
	nanosec += (t1->tv_nsec - t2->tv_nsec);

	return nanosec;
}

int main()
{
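	/* "result" is volatile so the compiler cannot optimize the calls away */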
	size_t pagebytes[BLCKSZ] = {0};
	volatile bool result;
  
	pagebytes[BLCKSZ-2] = 1;

	printf("pagebytes[BLCKSZ-2]=%ld\n", pagebytes[BLCKSZ-2]);

	result = allzeros_byte_per_byte(pagebytes, BLCKSZ);
	printf("byte per byte: %s\n", (result ? "is_allzeros" : "not is allzeros"));

	result = allzeros_size_t(pagebytes, BLCKSZ);
	printf("size_t: %s\n", (result ? "is_allzeros" : "not is allzeros"));

	result = pg_memory_is_all_zeros_v10(pagebytes, BLCKSZ);
	printf("SIMD v10: %s\n", (result ? "is_allzeros" : "not is allzeros"));

	result = pg_memory_is_all_zeros_v11(pagebytes, BLCKSZ);
	printf("SIMD v11: %s\n", (result ? "is_allzeros" : "not is allzeros"));

	result = pg_memory_is_all_zeros_v12(pagebytes, BLCKSZ);
	printf("SIMD v12: %s\n", (result ? "is_allzeros" : "not is allzeros"));

	result = pg_memory_is_all_zeros_v14(pagebytes, BLCKSZ);
	printf("SIMD v14: %s\n", (result ? "is_allzeros" : "not is allzeros"));

	return 0;
}
