On Tue, 5 Nov 2024 at 06:39, Ranier Vilela <ranier...@gmail.com> wrote:
> I think we can add a small optimization to this last patch [1].
> The variable *aligned_end* is only needed in the second loop (for).
> So, only before the for loop do we actually declare it.
>
> Result before this change:
> check zeros using BERTRAND 1 0.000031s
>
> Result after this change:
> check zeros using BERTRAND 1 0.000018s
>
> + const unsigned char *aligned_end;
>
> + /* Multiple bytes comparison(s) at once */
> + aligned_end = (const unsigned char *) ((uintptr_t) end & (~(sizeof(size_t) - 1)));
> + for (; p < aligned_end; p += sizeof(size_t))
I think we all need to stop using Godbolt's servers to run benchmarks.
These servers are likely to be running various other workloads in highly
virtualised environments and are not going to be stable servers that would
give consistent benchmark results.

I tried your optimisation in the attached allzeros.c and here are my results:

# My version
$ gcc allzeros.c -O2 -o allzeros && for i in {1..3}; do ./allzeros; done
char: done in 1566400 nanoseconds
size_t: done in 195400 nanoseconds (8.01638 times faster than char)
char: done in 1537500 nanoseconds
size_t: done in 196300 nanoseconds (7.8324 times faster than char)
char: done in 1543600 nanoseconds
size_t: done in 196300 nanoseconds (7.86347 times faster than char)

# Ranier's optimization
$ gcc allzeros.c -O2 -D RANIERS_OPTIMIZATION -o allzeros && for i in {1..3}; do ./allzeros; done
char: done in 1943100 nanoseconds
size_t: done in 531700 nanoseconds (3.6545 times faster than char)
char: done in 1957200 nanoseconds
size_t: done in 458400 nanoseconds (4.26963 times faster than char)
char: done in 1949500 nanoseconds
size_t: done in 469000 nanoseconds (4.15672 times faster than char)

Seems to be about half as fast with gcc at -O2.

David
/*
 * allzeros.c
 *
 * Micro-benchmark comparing ways of testing whether a buffer contains only
 * zero bytes: a byte-at-a-time scan, a word-at-a-time scan over a
 * pre-aligned buffer, and pg_memory_is_all_zeros(), which handles
 * arbitrary alignment.
 *
 * Build switches:
 *   -D RANIERS_OPTIMIZATION       compute aligned_end just before the word
 *                                 loop instead of at its declaration
 *   -D TEST_PRE_ALIGNED_VERSION=1 also time allzeros_size_t()
 */

/* Expose clock_gettime() and CLOCK_PROCESS_CPUTIME_ID under strict -std=c11. */
#ifndef _POSIX_C_SOURCE
#define _POSIX_C_SOURCE 200809L
#endif

#include <inttypes.h>
#include <stdbool.h>
#include <stddef.h>
#include <string.h>
#include <stdio.h>
#include <stdint.h>
#include <time.h>

#define BLCKSZ 8192
#define LOOPS 1000

/*
 * Byte-at-a-time reference implementation.
 *
 * Returns true if the first len bytes at ptr are all zero (trivially true
 * for len == 0), false as soon as a non-zero byte is found.
 */
static inline bool
allzeros_char(const void *ptr, size_t len)
{
    const char *p = (const char *) ptr;

    for (size_t i = 0; i < len; i++)
    {
        if (p[i] != 0)
            return false;
    }

    return true;
}

/*
 * Word-at-a-time implementation for pre-aligned input.
 *
 * ptr is dereferenced as size_t, so it must be suitably aligned for size_t.
 * Only len / sizeof(size_t) whole words are examined; any trailing
 * len % sizeof(size_t) bytes are NOT checked.
 */
static inline bool
allzeros_size_t(const void *ptr, size_t len)
{
    const size_t *p = (const size_t *) ptr;

    for (size_t i = 0; i < len / sizeof(size_t); i++)
    {
        if (p[i] != 0)
            return false;
    }

    return true;
}

/*
 * Test whether the len bytes starting at ptr are all zero, for any
 * alignment of ptr.
 *
 * Works in three phases:
 *   1. byte comparisons until p is aligned on a sizeof(size_t) boundary
 *      (returning early if the end of the buffer is reached first);
 *   2. size_t-wide comparisons up to aligned_end, which is end rounded
 *      down to a sizeof(size_t) boundary;
 *   3. byte comparisons for whatever remains between aligned_end and end.
 *
 * With RANIERS_OPTIMIZATION defined, aligned_end is assigned only after
 * phase 1 instead of being initialized at its declaration; the value is
 * the same either way.
 */
bool
pg_memory_is_all_zeros(const void *ptr, size_t len)
{
    const char *p = (const char *) ptr;
    const char *end = &p[len];
#ifdef RANIERS_OPTIMIZATION
    const char *aligned_end;
#else
    const char *aligned_end = (const char *)
        ((uintptr_t) end & (~(sizeof(size_t) - 1)));
#endif

    /* Phase 1: advance byte by byte until p is size_t-aligned. */
    while (((uintptr_t) p & (sizeof(size_t) - 1)) != 0)
    {
        if (p == end)
            return true;
        if (*p++ != 0)
            return false;
    }

#ifdef RANIERS_OPTIMIZATION
    aligned_end = (const char *)
        ((uintptr_t) end & (~(sizeof(size_t) - 1)));
#endif

    /* Phase 2: compare one size_t at a time (const-preserving cast). */
    for (; p < aligned_end; p += sizeof(size_t))
    {
        if (*(const size_t *) p != 0)
            return false;
    }

    /* Phase 3: compare the remaining tail bytes. */
    while (p < end)
    {
        if (*p++ != 0)
            return false;
    }

    return true;
}

#define NANOSEC_PER_SEC 1000000000

/*
 * Returns t1 - t2 in nanoseconds.
 *
 * The seconds are widened to int64_t before the multiplication so that the
 * product cannot overflow a narrower time_t.
 */
int64_t
get_clock_diff(struct timespec *t1, struct timespec *t2)
{
    int64_t nanosec;

    nanosec = ((int64_t) t1->tv_sec - (int64_t) t2->tv_sec) * NANOSEC_PER_SEC;
    nanosec += (t1->tv_nsec - t2->tv_nsec);

    return nanosec;
}

/*
 * Times LOOPS calls of each implementation over the first BLCKSZ bytes of
 * an all-zero buffer and prints the elapsed process CPU time.
 */
int
main(void)
{
    /*
     * Declared as size_t so the buffer is size_t-aligned.  Note that the
     * array has BLCKSZ elements (BLCKSZ * sizeof(size_t) bytes); only the
     * first BLCKSZ bytes are scanned.
     */
    size_t pagebytes[BLCKSZ] = {0};
    /* volatile so the calls being timed are not optimized away */
    volatile bool result;
    struct timespec start, end;
    int64_t char_time, size_t_time;

    clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &start);

    for (int i = 0; i < LOOPS; i++)
    {
        result = allzeros_char(pagebytes, BLCKSZ);
    }

    clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &end);
    char_time = get_clock_diff(&end, &start);
    /* PRId64 matches int64_t on every platform; "%ld" does not. */
    printf("char: done in %" PRId64 " nanoseconds\n", char_time);

#if TEST_PRE_ALIGNED_VERSION
    clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &start);

    for (int i = 0; i < LOOPS; i++)
    {
        result = allzeros_size_t(pagebytes, BLCKSZ);
    }

    clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &end);
    size_t_time = get_clock_diff(&end, &start);
    printf("size_t: done in %" PRId64 " nanoseconds (%g times faster than char)\n",
           size_t_time, (double) char_time / size_t_time);
#endif

    clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &start);

    for (int i = 0; i < LOOPS; i++)
    {
        result = pg_memory_is_all_zeros(pagebytes, BLCKSZ);
    }

    clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &end);
    size_t_time = get_clock_diff(&end, &start);
    printf("size_t: done in %" PRId64 " nanoseconds (%g times faster than char)\n",
           size_t_time, (double) char_time / size_t_time);

    return 0;
}