On Fri, 1 Nov 2024 at 19:27, Michael Paquier <mich...@paquier.xyz> wrote:
> Under gcc -O2 or -O3, the single-byte check or the 8-byte check don't
> make a difference.  Please see the attached (allzeros.txt) for a quick
> check if you want to check by yourself.  With 1M iterations, both
> average around 3ms for 1M iterations on my laptop (not the fastest
> thing around).
>
> Under -O0, though, the difference is noticeable:
> - 1-byte check: 3.52s for 1M iterations, averaging one check at
> 3.52ns.
> - 8-byte check: 0.46s for 1M iterations, averaging one check at
> 0.46ns.
>
> Even for that, I doubt that this is going to be noticeable in
> practice, still the difference exists.

The reason you're not seeing the slowdown with -O2 and -O3 is that
your compiler decided there was nothing to do, so it didn't emit the
code you were trying to benchmark.  Try looking at allzeros.s after
doing "gcc allzeros.c -S -O2".

I've attached an updated version for you to try. I used a volatile
bool and assigned the function result to it to prevent the compiler
from optimising out the test.

$ gcc allzeros.c -O2 -o allzeros
$ ./allzeros
char: done in 1607800 nanoseconds
size_t: done in 208800 nanoseconds (7.70019 times faster)

$ gcc allzeros.c -O3 -o allzeros
$ ./allzeros
char: done in 1584500 nanoseconds
size_t: done in 225700 nanoseconds (7.02038 times faster)

David
#include <stdbool.h>
#include <stddef.h>
#include <string.h>
#include <stdio.h>
#include <stdint.h>
#include <time.h>

/* Used in main() to size the test buffer and as the byte length passed to each zero-check call. */
#define BLCKSZ 8192
/* Number of times main() calls each zero-check function inside its timed loop. */
#define LOOPS 1000

/*
 * Byte-at-a-time zero check.
 *
 * ptr: start of the region to inspect.
 * len: number of bytes to inspect.
 *
 * Returns false as soon as a non-zero byte is found, true if none of the
 * len bytes is non-zero (which includes the len == 0 case).
 */
static inline bool
allzeros_char(const void *ptr, size_t len)
{
        const char *cur = (const char *) ptr;

        /* Count down the bytes still to be checked while walking the pointer. */
        for (size_t remaining = len; remaining > 0; remaining--, cur++)
        {
                if (*cur != 0)
                        return false;
        }
        return true;
}

/*
 * Word-at-a-time zero check.
 *
 * ptr: start of the region to inspect.  It is read through a size_t
 *      pointer, so the caller must pass an address suitably aligned for
 *      size_t.
 * len: number of bytes to inspect.
 *
 * The leading len / sizeof(size_t) words are compared one size_t at a
 * time; any remaining len % sizeof(size_t) bytes are then compared one
 * byte at a time so that a length which is not a multiple of the word size
 * is still fully covered.
 *
 * Returns false as soon as a non-zero word or byte is found, true
 * otherwise (including when len == 0).
 */
static inline bool
allzeros_size_t(const void *ptr, size_t len)
{
        const size_t *words = (const size_t *) ptr;
        const size_t nwords = len / sizeof(size_t);
        const size_t ntail = len % sizeof(size_t);

        for (size_t i = 0; i < nwords; i++)
        {
                if (words[i] != 0)
                        return false;
        }

        /* Check the bytes that do not fill a whole word, if there are any. */
        if (ntail != 0)
        {
                const unsigned char *tail =
                        (const unsigned char *) ptr + nwords * sizeof(size_t);

                for (size_t i = 0; i < ntail; i++)
                {
                        if (tail[i] != 0)
                                return false;
                }
        }
        return true;
}

#define NANOSEC_PER_SEC 1000000000

/*
 * Returns t1 - t2 in nanoseconds.
 *
 * t1: the later timestamp (minuend).
 * t2: the earlier timestamp (subtrahend).
 *
 * The result is negative when t1 is earlier than t2.  Both fields are
 * widened to int64_t before any arithmetic so the multiplication by
 * NANOSEC_PER_SEC is carried out in 64 bits regardless of how wide time_t
 * is on the platform.
 */
int64_t
get_clock_diff(struct timespec *t1, struct timespec *t2)
{
        int64_t sec_diff = (int64_t) t1->tv_sec - (int64_t) t2->tv_sec;
        int64_t nsec_diff = (int64_t) t1->tv_nsec - (int64_t) t2->tv_nsec;

        return sec_diff * NANOSEC_PER_SEC + nsec_diff;
}

/*
 * Benchmark driver.
 *
 * Times LOOPS calls of allzeros_char() and then LOOPS calls of
 * allzeros_size_t() over the same zero-filled BLCKSZ-byte buffer, using
 * CLOCK_PROCESS_CPUTIME_ID, and prints the elapsed nanoseconds for each
 * along with the ratio between the two.
 *
 * Returns 0 on success, 1 if clock_gettime() fails.
 */
int main(void)
{
        /*
         * BLCKSZ bytes of zeros.  Declared as an array of size_t so the
         * buffer is suitably aligned for the word-at-a-time check.
         */
        size_t pagebytes[BLCKSZ / sizeof(size_t)] = {0};
        /* volatile so the compiler cannot discard the calls being timed */
        volatile bool result;
        struct timespec start, end;
        int64_t char_time, size_t_time;

        if (clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &start) != 0)
        {
                perror("clock_gettime");
                return 1;
        }

        for (int i = 0; i < LOOPS; i++)
        {
                result = allzeros_char(pagebytes, BLCKSZ);
        }

        if (clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &end) != 0)
        {
                perror("clock_gettime");
                return 1;
        }
        char_time = get_clock_diff(&end, &start);
        /* int64_t is not necessarily long; cast to match the conversion */
        printf("char: done in %lld nanoseconds\n", (long long) char_time);

        if (clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &start) != 0)
        {
                perror("clock_gettime");
                return 1;
        }

        for (int i = 0; i < LOOPS; i++)
        {
                result = allzeros_size_t(pagebytes, BLCKSZ);
        }

        if (clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &end) != 0)
        {
                perror("clock_gettime");
                return 1;
        }
        size_t_time = get_clock_diff(&end, &start);
        printf("size_t: done in %lld nanoseconds (%g times faster)\n",
               (long long) size_t_time, (double) char_time / size_t_time);

        return 0;
}

Reply via email to