On Thu, Nov 07, 2024 at 08:10:17AM +1300, David Rowley wrote: > Did you try with a size where there's a decent remainder, say 124 > bytes? FWIW, one of the cases has 112 bytes, and I think that is > aligned memory meaning we'll do the first 64 in the SIMD loop and have > to do 48 bytes in the byte-at-a-time loop. If you had the loop Michael > mentioned, that would instead be 6 loops of size_t-at-a-time.
See the attached allzeros.c, based on the previous versions exchanged. Now imagine a structure like this: #define BLCKSZ 48 typedef union AlignedBlock { char data[BLCKSZ]; double force_align_d; int64_t force_align_i64; } AlignedBlock; With this structure, the first step (the byte-at-a-time loop that runs until the pointer is aligned) is skipped because the pointer is already aligned when allocated, and the second step with the potential SIMD is skipped because the structure is too small for it at 48 bytes. Hence only the last, byte-at-a-time step would do the allzero check. Adding a size_t-at-a-time step before that last loop is more efficient, as shown upthread: $ gcc -o allzeros -march=native -O2 allzeros.c $ ./allzeros allzeros: done in 118332297 nanoseconds allzeros_v2: done in 13877745 nanoseconds (8.52677 times faster) The allzero check is used for pgstat entries, and it is possible that some out-of-core code relies on such small-ish sizes, or that a future patch author uses it for something else. So let's optimize this as much as we reasonably can: that's what this discussion is about. -- Michael
/*
 * allzeros.c
 *
 * Micro-benchmark comparing two ways of checking that a small memory area
 * contains only zero bytes:
 *
 *   isallzeros()    - byte loop until aligned, 8-words-at-a-time loop,
 *                     then a byte-at-a-time tail.
 *   isallzeros_v2() - same, plus a word-at-a-time loop before the byte tail.
 *
 * Build: gcc -o allzeros -march=native -O2 allzeros.c
 *
 * Define ALLZEROS_NO_MAIN to compile only the helpers, without the benchmark
 * driver (e.g. to embed them in a test program).
 */

/* clock_gettime() and CLOCK_PROCESS_CPUTIME_ID are POSIX, not ISO C. */
#ifndef _POSIX_C_SOURCE
#define _POSIX_C_SOURCE 200809L
#endif

#include <inttypes.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

/* Number of calls timed for each implementation. */
#define LOOPS 10000000

/* Size in bytes of the buffer checked by the benchmark. */
#define BLCKSZ 48

/*
 * Buffer of BLCKSZ bytes; the double and int64_t members force the union to
 * be aligned at least as strictly as those types.
 */
typedef union AlignedBlock
{
    char        data[BLCKSZ];
    double      force_align_d;
    int64_t     force_align_i64;
} AlignedBlock;

/*
 * isallzeros
 *
 * Returns true if the "len" bytes starting at "ptr" are all zero (trivially
 * true when len is 0), false otherwise.
 *
 * Works in three steps: byte-at-a-time until "p" is size_t-aligned, then
 * eight size_t words per iteration, then byte-at-a-time for whatever is left.
 */
static inline bool
isallzeros(const void *ptr, size_t len)
{
    const char *p = (const char *) ptr;
    const char *end = &p[len];
    /* "end" rounded down to a multiple of sizeof(size_t) */
    const char *aligned_end = (const char *)
        ((uintptr_t) end & (~(sizeof(size_t) - 1)));

    /* Compare bytes until "p" is aligned, or the area is exhausted. */
    while (((uintptr_t) p & (sizeof(size_t) - 1)) != 0)
    {
        if (p == end)
            return true;
        if (*p++ != 0)
            return false;
    }

    /*
     * Here "p" is aligned and p <= end, hence p <= aligned_end.  Test the
     * remaining distance instead of computing "aligned_end - 7 words", which
     * would form a pointer before the start of a short buffer.
     */
    for (; (size_t) (aligned_end - p) >= sizeof(size_t) * 8;
         p += sizeof(size_t) * 8)
    {
        const size_t *w = (const size_t *) p;

        /* Bitwise OR on purpose: all eight words are tested, no short-circuit. */
        if ((w[0] != 0) | (w[1] != 0) | (w[2] != 0) | (w[3] != 0) |
            (w[4] != 0) | (w[5] != 0) | (w[6] != 0) | (w[7] != 0))
            return false;
    }

    /* Remaining bytes, one at a time. */
    while (p < end)
    {
        if (*p++ != 0)
            return false;
    }

    return true;
}

/*
 * isallzeros_v2
 *
 * Same contract as isallzeros().  The difference is an extra step that
 * checks the aligned remainder one size_t word at a time before falling back
 * to the byte-at-a-time tail.
 */
static inline bool
isallzeros_v2(const void *ptr, size_t len)
{
    const char *p = (const char *) ptr;
    const char *end = &p[len];
    /* "end" rounded down to a multiple of sizeof(size_t) */
    const char *aligned_end = (const char *)
        ((uintptr_t) end & (~(sizeof(size_t) - 1)));

    /* Compare bytes until "p" is aligned, or the area is exhausted. */
    while (((uintptr_t) p & (sizeof(size_t) - 1)) != 0)
    {
        if (p == end)
            return true;
        if (*p++ != 0)
            return false;
    }

    /*
     * Here "p" is aligned and p <= end, hence p <= aligned_end.  Test the
     * remaining distance instead of computing "aligned_end - 7 words", which
     * would form a pointer before the start of a short buffer.
     */
    for (; (size_t) (aligned_end - p) >= sizeof(size_t) * 8;
         p += sizeof(size_t) * 8)
    {
        const size_t *w = (const size_t *) p;

        /* Bitwise OR on purpose: all eight words are tested, no short-circuit. */
        if ((w[0] != 0) | (w[1] != 0) | (w[2] != 0) | (w[3] != 0) |
            (w[4] != 0) | (w[5] != 0) | (w[6] != 0) | (w[7] != 0))
            return false;
    }

    /* Aligned remainder, one word at a time. */
    for (; p < aligned_end; p += sizeof(size_t))
    {
        if (*(const size_t *) p != 0)
            return false;
    }

    /* Remaining bytes, one at a time. */
    while (p < end)
    {
        if (*p++ != 0)
            return false;
    }

    return true;
}

#define NANOSEC_PER_SEC 1000000000

/*
 * get_clock_diff
 *
 * Returns t1 - t2 in nanoseconds.  The seconds are widened to int64_t before
 * the subtraction and multiplication so the arithmetic does not depend on the
 * width of time_t.
 */
int64_t
get_clock_diff(const struct timespec *t1, const struct timespec *t2)
{
    int64_t     nanosec;

    nanosec = ((int64_t) t1->tv_sec - (int64_t) t2->tv_sec) * NANOSEC_PER_SEC;
    nanosec += (int64_t) t1->tv_nsec - (int64_t) t2->tv_nsec;

    return nanosec;
}

#ifndef ALLZEROS_NO_MAIN

/* Reads the process CPU-time clock into *ts; exits the program on failure. */
static void
read_cpu_clock(struct timespec *ts)
{
    if (clock_gettime(CLOCK_PROCESS_CPUTIME_ID, ts) != 0)
    {
        perror("clock_gettime");
        exit(EXIT_FAILURE);
    }
}

/*
 * Times LOOPS calls of isallzeros() and of isallzeros_v2() on one zeroed
 * AlignedBlock and prints both durations plus their ratio.
 */
int
main(void)
{
    AlignedBlock *pagebytes;
    volatile bool result;       /* volatile so the calls are not optimized away */
    struct timespec start;
    struct timespec end;
    int64_t     char_time;
    int64_t     size_t_time;

    pagebytes = malloc(sizeof *pagebytes);
    if (pagebytes == NULL)
    {
        perror("malloc");
        return EXIT_FAILURE;
    }
    memset(pagebytes, 0, sizeof *pagebytes);

    read_cpu_clock(&start);
    for (int i = 0; i < LOOPS; i++)
    {
        result = isallzeros(pagebytes, sizeof(AlignedBlock));
    }
    read_cpu_clock(&end);
    char_time = get_clock_diff(&end, &start);
    printf("allzeros: done in %" PRId64 " nanoseconds\n", char_time);

    read_cpu_clock(&start);
    for (int i = 0; i < LOOPS; i++)
    {
        result = isallzeros_v2(pagebytes, sizeof(AlignedBlock));
    }
    read_cpu_clock(&end);
    size_t_time = get_clock_diff(&end, &start);
    printf("allzeros_v2: done in %" PRId64 " nanoseconds (%g times faster)\n",
           size_t_time, (double) char_time / size_t_time);

    (void) result;
    free(pagebytes);
    return 0;
}

#endif                          /* ALLZEROS_NO_MAIN */
signature.asc
Description: PGP signature