Hi,

On Wed, Nov 13, 2024 at 09:25:37AM +0900, Michael Paquier wrote:
> So that seems worth the addition, especially for
> smaller sizes where this is 6 times faster here.

So, something like v12 in pg_memory_is_all_zeros_v12() in allzeros_small.c
attached?

If so, that gives us:

== with BLCKSZ 32

$ /usr/local/gcc-14.1.0/bin/gcc-14.1.0 -march=native -O2 allzeros_small.c -o 
allzeros_small ; ./allzeros_small
byte per byte: done in 22421 nanoseconds
size_t: done in 7269 nanoseconds (3.08447 times faster than byte per byte)
SIMD v10: done in 6349 nanoseconds (3.53142 times faster than byte per byte)
SIMD v11: done in 22080 nanoseconds (1.01544 times faster than byte per byte)
SIMD v12: done in 5595 nanoseconds (4.00733 times faster than byte per byte)

== with BLCKSZ 63

$ /usr/local/gcc-14.1.0/bin/gcc-14.1.0 -march=native -O2 allzeros_small.c -o 
allzeros_small ; ./allzeros_small
byte per byte: done in 29525 nanoseconds
size_t: done in 11232 nanoseconds (2.62865 times faster than byte per byte)
SIMD v10: done in 10828 nanoseconds (2.72673 times faster than byte per byte)
SIMD v11: done in 42056 nanoseconds (0.70204 times faster than byte per byte)
SIMD v12: done in 10468 nanoseconds (2.8205 times faster than byte per byte)

== with BLCKSZ 256

$ /usr/local/gcc-14.1.0/bin/gcc-14.1.0 -march=native -O2 allzeros_small.c -o 
allzeros_small ; ./allzeros_small
byte per byte: done in 120483 nanoseconds
size_t: done in 23098 nanoseconds (5.21617 times faster than byte per byte)
SIMD v10: done in 6737 nanoseconds (17.8838 times faster than byte per byte)
SIMD v11: done in 6621 nanoseconds (18.1971 times faster than byte per byte)
SIMD v12: done in 6519 nanoseconds (18.4818 times faster than byte per byte)

== with BLCKSZ 8192

$ /usr/local/gcc-14.1.0/bin/gcc-14.1.0 -march=native -O2 allzeros_small.c -o 
allzeros_small ; ./allzeros_small
byte per byte: done in 3393459 nanoseconds
size_t: done in 707304 nanoseconds (4.79774 times faster than byte per byte)
SIMD v10: done in 233559 nanoseconds (14.5293 times faster than byte per byte)
SIMD v11: done in 225951 nanoseconds (15.0186 times faster than byte per byte)
SIMD v12: done in 225766 nanoseconds (15.0309 times faster than byte per byte)

That's better for small size but given the extra len checks that
has been added I think we're back to David's point in [1]: What if the function
is not inlined for some reason?

So, out of curiosity, let's see what happens if not inlined in [2] (see the
-O2 -DNOT_INLINE compiler window):

- if a[3]: it looks like gcc is smart enough to create an optimized version
for that size using constant propagation
- if a[63]: Same as above
- if a[256]: Same as above
- if a[8192]: Same as above

I did a quick check with clang and it looks like it is not as smart as gcc
for the non inline case.

Anyway it's not like we have the choice: we need (at least) one len check for
safety reason (to not crash or read invalid data).

So, I'd vote for pg_memory_is_all_zeros_v12() then, thoughts?

[1]: 
https://www.postgresql.org/message-id/CAApHDvp2jx_%3DpFbgj-O1_ZmzP9WOZKfwLzZrS_%3DZmbsqMQQ59g%40mail.gmail.com
[2]: https://godbolt.org/z/8s44GKqcc 

Regards,

-- 
Bertrand Drouvot
PostgreSQL Contributors Team
RDS Open Source Databases
Amazon Web Services: https://aws.amazon.com
#include <stdbool.h>
#include <stddef.h>
#include <string.h>
#include <stdio.h>
#include <stdint.h>
#include <time.h>
#include <immintrin.h>

#define BLCKSZ 32
#define LOOPS 1000

static inline bool
allzeros_byte_per_byte(const void *ptr, size_t len)
{
	const unsigned char *p = (const unsigned char *) ptr;
	const unsigned char *end = &p[len];

	while (p < end)
	{
		if (*p++ != 0)
			return false;
	}
	return true;
}

static inline bool
allzeros_size_t(const void *ptr, size_t len)
{
    const unsigned char *p = (const unsigned char *) ptr;
    const unsigned char *end = &p[len];
    const unsigned char *aligned_end = (const unsigned char *)
        ((uintptr_t) end & (~(sizeof(size_t) - 1)));

    /* Compare bytes until the pointer "p" is aligned */
    while (((uintptr_t) p & (sizeof(size_t) - 1)) != 0)
    {
        if (p == end)
            return true;

        if (*p++ != 0)
            return false;
    }

    /*
     * Compare remaining size_t-aligned chunks.
     *
     * aligned_end cant' be > end as we ensured to take care of len < 8 (in
     * the len < 64 check below). So, no risk to read beyond the memory area.
     */
    for (; p < aligned_end; p += sizeof(size_t))
    {
        if (*(size_t *) p != 0)
            return false;
    }

    /* Compare remaining bytes until the end */
    while (p < end)
    {
        if (*p++ != 0)
            return false;
    }

    return true;
}

static inline bool
pg_memory_is_all_zeros_v10(const void *ptr, size_t len)
{
    const unsigned char *p = (const unsigned char *) ptr;
    const unsigned char *end = &p[len];
    const unsigned char *aligned_end = (const unsigned char *)
        ((uintptr_t) end & (~(sizeof(size_t) - 1)));

    /* Compare bytes until the pointer "p" is aligned */
    while (((uintptr_t) p & (sizeof(size_t) - 1)) != 0)
    {
        if (p == end)
            return true;

        if (*p++ != 0)
            return false;
    }

    /*
     * Compare 8 * sizeof(size_t) chunks at once.
     *
     * For performance reasons, we manually unroll this loop and purposefully
     * use bitwise-ORs to combine each comparison.  This prevents boolean
     * short-circuiting and lets the compiler know that it's safe to access
     * all 8 elements regardless of the result of the other comparisons.  This
     * seems to be enough to coax a few compilers into using SIMD
     * instructions.
     *
     * There is no risk to read beyond the memory area thanks to the len < 64
     * check done below.
     */
    for (; p < aligned_end - (sizeof(size_t) * 7); p += sizeof(size_t) * 8)
    {
        if ((((size_t *) p)[0] != 0) | (((size_t *) p)[1] != 0) |
            (((size_t *) p)[2] != 0) | (((size_t *) p)[3] != 0) |
            (((size_t *) p)[4] != 0) | (((size_t *) p)[5] != 0) |
            (((size_t *) p)[6] != 0) | (((size_t *) p)[7] != 0))
            return false;
    }

    /*
     * Compare remaining size_t-aligned chunks.
     *
     * aligned_end cant' be > end as we ensured to take care of len < 8 (in
     * the len < 64 check below). So, no risk to read beyond the memory area.
     */
    for (; p < aligned_end; p += sizeof(size_t))
    {
        if (*(size_t *) p != 0)
            return false;
    }

    /* Compare remaining bytes until the end */
    while (p < end)
    {
        if (*p++ != 0)
            return false;
    }

    return true;
}


static inline bool
pg_memory_is_all_zeros_v11(const void *ptr, size_t len)
{
    const unsigned char *p = (const unsigned char *) ptr;
    const unsigned char *end = &p[len];
    const unsigned char *aligned_end = (const unsigned char *)
        ((uintptr_t) end & (~(sizeof(size_t) - 1)));

    /*
     * For len < 64, compare byte per byte to ensure we'll not read beyond the
     * memory area.
     */
    if (len < sizeof(size_t) * 8)
    {
        while (p < end)
        {
            if (*p++ != 0)
                return false;
        }
        return true;
    }


    /* Compare bytes until the pointer "p" is aligned */
    while (((uintptr_t) p & (sizeof(size_t) - 1)) != 0)
    {
        if (p == end)
            return true;

        if (*p++ != 0)
            return false;
    }

    /*
     * Compare 8 * sizeof(size_t) chunks at once.
     *
     * For performance reasons, we manually unroll this loop and purposefully
     * use bitwise-ORs to combine each comparison.  This prevents boolean
     * short-circuiting and lets the compiler know that it's safe to access
     * all 8 elements regardless of the result of the other comparisons.  This
     * seems to be enough to coax a few compilers into using SIMD
     * instructions.
     *
     * There is no risk to read beyond the memory area thanks to the len < 64
     * check done below.
     */
    for (; p < aligned_end - (sizeof(size_t) * 7); p += sizeof(size_t) * 8)
    {
        if ((((size_t *) p)[0] != 0) | (((size_t *) p)[1] != 0) |
            (((size_t *) p)[2] != 0) | (((size_t *) p)[3] != 0) |
            (((size_t *) p)[4] != 0) | (((size_t *) p)[5] != 0) |
            (((size_t *) p)[6] != 0) | (((size_t *) p)[7] != 0))
            return false;
    }

    /*
     * Compare remaining size_t-aligned chunks.
     *
     * aligned_end cant' be > end as we ensured to take care of len < 8 (in
     * the len < 64 check below). So, no risk to read beyond the memory area.
     */
    for (; p < aligned_end; p += sizeof(size_t))
    {
        if (*(size_t *) p != 0)
            return false;
    }

    /* Compare remaining bytes until the end */
    while (p < end)
    {
        if (*p++ != 0)
            return false;
    }

    return true;
}


static inline bool
pg_memory_is_all_zeros_v12(const void *ptr, size_t len)
{
    const unsigned char *p = (const unsigned char *) ptr;
    const unsigned char *end = &p[len];
    const unsigned char *aligned_end = (const unsigned char *)
        ((uintptr_t) end & (~(sizeof(size_t) - 1)));

    if (len < sizeof(size_t))  // < 8 bytes
    {
        while (p < end)
        {
            if (*p++ != 0)
                return false;
        }
        return true;
    }

    if (len < sizeof(size_t) * 8)  // 8-63 bytes
    {
        while (((uintptr_t) p & (sizeof(size_t) - 1)) != 0)
        {
            if (p == end)
                return true;
            if (*p++ != 0)
                return false;
        }
        
        for (; p < aligned_end; p += sizeof(size_t))
        {
            if (*(size_t *) p != 0)
                return false;
        }
        
        while (p < end)
        {
            if (*p++ != 0)
                return false;
        }
        return true;
    }

    /* Compare bytes until the pointer "p" is aligned */
    while (((uintptr_t) p & (sizeof(size_t) - 1)) != 0)
    {
        if (p == end)
            return true;

        if (*p++ != 0)
            return false;
    }

    /*
     * Compare 8 * sizeof(size_t) chunks at once.
     *
     * For performance reasons, we manually unroll this loop and purposefully
     * use bitwise-ORs to combine each comparison.  This prevents boolean
     * short-circuiting and lets the compiler know that it's safe to access
     * all 8 elements regardless of the result of the other comparisons.  This
     * seems to be enough to coax a few compilers into using SIMD
     * instructions.
     *
     * There is no risk to read beyond the memory area thanks to the len < 64
     * check done below.
     */
    for (; p < aligned_end - (sizeof(size_t) * 7); p += sizeof(size_t) * 8)
    {
        if ((((size_t *) p)[0] != 0) | (((size_t *) p)[1] != 0) |
            (((size_t *) p)[2] != 0) | (((size_t *) p)[3] != 0) |
            (((size_t *) p)[4] != 0) | (((size_t *) p)[5] != 0) |
            (((size_t *) p)[6] != 0) | (((size_t *) p)[7] != 0))
            return false;
    }

    /*
     * Compare remaining size_t-aligned chunks.
     *
     * aligned_end cant' be > end as we ensured to take care of len < 8 (in
     * the len < 64 check below). So, no risk to read beyond the memory area.
     */
    for (; p < aligned_end; p += sizeof(size_t))
    {
        if (*(size_t *) p != 0)
            return false;
    }

    /* Compare remaining bytes until the end */
    while (p < end)
    {
        if (*p++ != 0)
            return false;
    }

    return true;
}

#define NANOSEC_PER_SEC 1000000000

// Returns difference in nanoseconds
int64_t
get_clock_diff(struct timespec *t1, struct timespec *t2)
{
	int64_t nanosec = (t1->tv_sec - t2->tv_sec) * NANOSEC_PER_SEC;
	nanosec += (t1->tv_nsec - t2->tv_nsec);

	return nanosec;
}

int main()
{
	size_t pagebytes[BLCKSZ] = {0};
	volatile bool result;
	struct timespec start,end;
	int64_t byte_time, size_t_time;

	clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &start);

	for (int i = 0; i < LOOPS; i++)
	{
		result = allzeros_byte_per_byte(pagebytes, BLCKSZ);
	}

	clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &end);
	byte_time = get_clock_diff(&end, &start);
	printf("byte per byte: done in %ld nanoseconds\n", byte_time);

	clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &start);

	for (int i = 0; i < LOOPS; i++)
	{
		result = allzeros_size_t(pagebytes, BLCKSZ);
	}

	clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &end);
	size_t_time = get_clock_diff(&end, &start);
	printf("size_t: done in %ld nanoseconds (%g times faster than byte per byte)\n", size_t_time, (double) byte_time / size_t_time);

	clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &start);

	for (int i = 0; i < LOOPS; i++)
	{
		result = pg_memory_is_all_zeros_v10(pagebytes, BLCKSZ);
	}

	clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &end);
	size_t_time = get_clock_diff(&end, &start);
	printf("SIMD v10: done in %ld nanoseconds (%g times faster than byte per byte)\n", size_t_time, (double) byte_time / size_t_time);

	clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &start);
	for (int i = 0; i < LOOPS; i++)
	{
		result = pg_memory_is_all_zeros_v11(pagebytes, BLCKSZ);
	}

	clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &end);
	size_t_time = get_clock_diff(&end, &start);
	printf("SIMD v11: done in %ld nanoseconds (%g times faster than byte per byte)\n", size_t_time, (double) byte_time / size_t_time);

	clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &start);
	for (int i = 0; i < LOOPS; i++)
	{
		result = pg_memory_is_all_zeros_v12(pagebytes, BLCKSZ);
	}

	clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &end);
	size_t_time = get_clock_diff(&end, &start);
	printf("SIMD v12: done in %ld nanoseconds (%g times faster than byte per byte)\n", size_t_time, (double) byte_time / size_t_time);

	return 0;
}

Reply via email to