On Tue, 5 Nov 2024 at 06:39, Ranier Vilela <ranier...@gmail.com> wrote:
> I think we can add a small optimization to this last patch [1].
> The variable *aligned_end* is only needed in the second loop (for).
> So, only before the for loop do we actually declare it.
>
> Result before this change:
> check zeros using BERTRAND 1 0.000031s
>
> Result after this change:
> check zeros using BERTRAND 1 0.000018s
>
> + const unsigned char *aligned_end;
>
> + /* Multiple bytes comparison(s) at once */
> + aligned_end = (const unsigned char *) ((uintptr_t) end & (~(sizeof(size_t) 
> - 1)));
> + for (; p < aligned_end; p += sizeof(size_t))

I think we all need to stop using Godbolt's servers to run benchmarks
on. These servers are likely to be running various other workloads in
highly virtualised environments and are not going to be stable servers
that would give consistent benchmark results.

I tried your optimisation in the attached allzeros.c and here are my results:

# My version
$ gcc allzeros.c -O2 -o allzeros && for i in {1..3}; do ./allzeros; done
char: done in 1566400 nanoseconds
size_t: done in 195400 nanoseconds (8.01638 times faster than char)
char: done in 1537500 nanoseconds
size_t: done in 196300 nanoseconds (7.8324 times faster than char)
char: done in 1543600 nanoseconds
size_t: done in 196300 nanoseconds (7.86347 times faster than char)

# Ranier's optimization
$ gcc allzeros.c -O2 -D RANIERS_OPTIMIZATION -o allzeros && for i in
{1..3}; do ./allzeros; done
char: done in 1943100 nanoseconds
size_t: done in 531700 nanoseconds (3.6545 times faster than char)
char: done in 1957200 nanoseconds
size_t: done in 458400 nanoseconds (4.26963 times faster than char)
char: done in 1949500 nanoseconds
size_t: done in 469000 nanoseconds (4.15672 times faster than char)

With gcc at -O2, the version with your optimization seems to be about half as fast.

David
#include <inttypes.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <time.h>

#define BLCKSZ 8192
#define LOOPS 1000

/*
 * Byte-at-a-time baseline: reports whether the first len bytes starting at
 * ptr are all zero.  Returns true when len is 0.
 */
static inline bool
allzeros_char(const void *ptr, size_t len)
{
        const char *cur = (const char *) ptr;
        const char *stop = cur + len;

        while (cur != stop)
        {
                if (*cur != 0)
                        return false;
                cur++;
        }
        return true;
}

/*
 * Word-at-a-time check: examines len / sizeof(size_t) whole size_t words
 * starting at ptr and reports whether every one of them is zero.
 *
 * Any trailing len % sizeof(size_t) bytes are not examined.  ptr is read
 * through a size_t pointer, so it is expected to be suitably aligned.
 */
static inline bool
allzeros_size_t(const void *ptr, size_t len)
{
        const size_t *word = (const size_t *) ptr;
        size_t nwords = len / sizeof(size_t);

        while (nwords > 0)
        {
                if (*word != 0)
                        return false;
                word++;
                nwords--;
        }
        return true;
}

/*
 * Reports whether the len bytes starting at ptr are all zero.
 *
 * The scan runs in three phases:
 *   1. byte-by-byte until p is aligned to a sizeof(size_t) boundary,
 *   2. one size_t word at a time up to aligned_end,
 *   3. byte-by-byte over whatever remains before end.
 *
 * Returns true if no non-zero byte is found (including when len is 0),
 * false as soon as a non-zero byte or word is seen.
 *
 * When RANIERS_OPTIMIZATION is defined, aligned_end is only computed after
 * phase 1 instead of at its declaration; the result is the same, only the
 * placement of the computation differs (this is what the benchmark compares).
 */
bool
pg_memory_is_all_zeros(const void *ptr, size_t len)
{
        const char *p = (const char *) ptr;
        const char *end = &p[len];
#ifdef RANIERS_OPTIMIZATION
        /* Declared here, assigned just before the word loop below. */
        const char *aligned_end;
#else
        /* end rounded down to a multiple of sizeof(size_t). */
        const char *aligned_end = (const char *) ((uintptr_t) end & 
(~(sizeof(size_t) - 1)));
#endif

        /* Phase 1: compare single bytes until p is size_t-aligned. */
        while (((uintptr_t) p & (sizeof(size_t) - 1)) != 0)
        {
                /* The whole range was consumed before reaching alignment. */
                if (p == end)
                        return true;

                if (*p++ != 0)
                        return false;
        }

#ifdef RANIERS_OPTIMIZATION
        /* end rounded down to a multiple of sizeof(size_t). */
        aligned_end = (const char *) ((uintptr_t) end & (~(sizeof(size_t) - 
1)));
#endif
        /* Phase 2: compare one size_t word per iteration. */
        for (; p < aligned_end; p += sizeof(size_t))
        {
                if (*(size_t *) p != 0)
                        return false;
        }

        /* Phase 3: compare the bytes left between aligned_end and end. */
        while (p < end)
        {
                if (*p++ != 0)
                        return false;
        }

        return true;
}


#define NANOSEC_PER_SEC 1000000000

/*
 * Returns the difference t1 - t2 in nanoseconds.
 *
 * t1, t2: timestamps to subtract; neither is modified.
 *
 * The result is negative when t1 is earlier than t2.
 */
int64_t
get_clock_diff(struct timespec *t1, struct timespec *t2)
{
        /*
         * Widen to int64_t before multiplying: with a 32-bit time_t the
         * product of the seconds difference and NANOSEC_PER_SEC would
         * otherwise overflow after only a couple of seconds.
         */
        int64_t nanosec = ((int64_t) t1->tv_sec - (int64_t) t2->tv_sec) * NANOSEC_PER_SEC;

        nanosec += (int64_t) t1->tv_nsec - (int64_t) t2->tv_nsec;

        return nanosec;
}

/*
 * Benchmark driver: times LOOPS calls of each all-zeros check over a zeroed
 * BLCKSZ-byte buffer, using the process CPU-time clock, and prints each
 * elapsed time plus the speedup relative to the byte-at-a-time version.
 *
 * The allzeros_size_t() run is only compiled in when
 * TEST_PRE_ALIGNED_VERSION is defined to a non-zero value.
 */
int main(void)
{
        /*
         * Declared as size_t elements so the buffer is word-aligned; sized so
         * that it holds exactly the BLCKSZ bytes that are scanned below.
         */
        size_t pagebytes[BLCKSZ / sizeof(size_t)] = {0};
        /* volatile so the stores of each call's result are not discarded */
        volatile bool result;
        struct timespec start,end;
        int64_t char_time, size_t_time;

        clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &start);

        for (int i = 0; i < LOOPS; i++)
        {
                result = allzeros_char(pagebytes, BLCKSZ);
        }

        clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &end);
        char_time = get_clock_diff(&end, &start);
        /* PRId64 matches int64_t on every platform; "%ld" only where it is long */
        printf("char: done in %" PRId64 " nanoseconds\n", char_time);

#if TEST_PRE_ALIGNED_VERSION
        clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &start);

        for (int i = 0; i < LOOPS; i++)
        {
                result = allzeros_size_t(pagebytes, BLCKSZ);
        }

        clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &end);
        size_t_time = get_clock_diff(&end, &start);
        printf("size_t: done in %" PRId64 " nanoseconds (%g times faster than char)\n",
               size_t_time, (double) char_time / size_t_time);
#endif

        clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &start);

        for (int i = 0; i < LOOPS; i++)
        {
                result = pg_memory_is_all_zeros(pagebytes, BLCKSZ);
        }

        clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &end);
        size_t_time = get_clock_diff(&end, &start);
        printf("size_t: done in %" PRId64 " nanoseconds (%g times faster than char)\n",
               size_t_time, (double) char_time / size_t_time);

        return 0;
}

Reply via email to