On Fri, Nov 15, 2024 at 03:46, Bertrand Drouvot <bertranddrouvot...@gmail.com> wrote:
> Hi,
>
> On Fri, Nov 15, 2024 at 09:30:25AM +0900, Michael Paquier wrote:
> > On Thu, Nov 14, 2024 at 12:33:20PM +0000, Bertrand Drouvot wrote:
> > Anyway, as you say, the
> > portability of v12 is OK even for sizeof(size_t) == 4 because we don't
> > rely on any hardcoded values, and this patch does what it should in
> > this case (double-checked myself manually for the three cases with
> > -m32).
>
> Yeah, thanks for the testing!
>
> > > What would be unsafe on 32-bit would be to read up to 32 bytes while len < 32
> > > and that can not happen.
> > >
> > > As mentioned up-thread the comments are wrong on 32-bit, indeed they must be read
> > > as:
> > >
> > > Case 1: len < 4 bytes
> > > Case 2: len in the 4-31 bytes range
> > > Case 3: len >= 32 bytes
> >
> > This part could be indeed better than what's proposed in v12, so I
> > would recommend to use sizeof(size_t) a bit more consistently rather
> > than have the reader guess that.
>
> Makes sense even if that looks "more difficult" to read.
>
> > Some comments feel duplicated, as well, like the "no risk" mentions,
> > which are clear enough based on the description and the limitations of
> > the previous cases. I'd like to suggest a few tweaks, making the
> > comments more flexible. See 0003 that applies on top of your latest
> > patch set, reattaching v12 again.
>
> Thanks! Applied on v13 attached, except for things like:
>
> "
> -       /* Compare bytes until the pointer "p" is aligned */
> +       /* Compare bytes until the pointer "p" is aligned. */
> "
>
> which is adding a "." at the end of single line comments (as the few already
> part of this file don't do so).

There is a tiny typo in v13:

+       /* "len" in the [sizeof(size_t) * 8, inf] range */

But I'm not sure whether I'm still doing something wrong; if so, forgive me
for the noise. With the attached v3_allzeros_check.c, the result is:

cc -march=native -O2 v3_allzeros_check.c -o v3_allzeros_check ; ./v3_allzeros_check
pagebytes[BLCKSZ-2]=1
byte per byte: is_allzeros
size_t: is_allzeros
SIMD v10: is_allzeros
SIMD v11: is_allzeros
SIMD v12: is_allzeros
SIMD v14: is_allzeros

Of course, I expected "not is_allzeros".

Anyway, I made another attempt to optimize a bit more; I don't know whether
it is safe, though. Results with the attached v3_allzeros_small.c, with
BLCKSZ = 8192, on Ubuntu 22.04, 64 bits:

gcc -march=native -O2 v3_allzeros_small.c -o v3_allzeros_small ; ./v3_allzeros_small
byte per byte: done in 5027744 nanoseconds
size_t: done in 382521 nanoseconds (13.1437 times faster than byte per byte)
SIMD v10: done in 157777 nanoseconds (31.8661 times faster than byte per byte)
SIMD v11: done in 159696 nanoseconds (31.4832 times faster than byte per byte)
SIMD v12: done in 168117 nanoseconds (29.9062 times faster than byte per byte)
SIMD v14: done in 21008 nanoseconds (239.325 times faster than byte per byte)

best regards,
Ranier Vilela
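PS: as a side note on phrasing the case boundaries with sizeof(size_t), here
is a tiny standalone sketch (the helper name allzeros_case is only
illustrative, it is not part of any patch) that prints which of the three
ranges a given "len" falls into; wording the comments this way then reads
correctly on both 32-bit and 64-bit builds:

#include <stdio.h>
#include <stddef.h>

/* Illustration only: map "len" to the three cases discussed above. */
static const char *
allzeros_case(size_t len)
{
    if (len < sizeof(size_t))
        return "case 1: byte per byte only";
    else if (len < sizeof(size_t) * 8)
        return "case 2: size_t-aligned chunks, no unrolled loop";
    else
        return "case 3: unrolled 8 * sizeof(size_t) (SIMD-friendly) loop";
}

int
main(void)
{
    size_t lens[] = {3, 4, 7, 8, 31, 32, 63, 64, 8192};

    for (size_t i = 0; i < sizeof(lens) / sizeof(lens[0]); i++)
        printf("len = %4zu -> %s\n", lens[i], allzeros_case(lens[i]));
    return 0;
}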
[Attachment: v3_allzeros_small.c]

#include <stdbool.h>
#include <stddef.h>
#include <string.h>
#include <stdio.h>
#include <stdint.h>
#include <time.h>
#include <immintrin.h>

#define BLCKSZ 8192
#define LOOPS 1000

static inline bool
allzeros_byte_per_byte(const void *ptr, size_t len)
{
    const unsigned char *p = (const unsigned char *) ptr;
    const unsigned char *end = &p[len];

    while (p < end)
    {
        if (*p++ != 0)
            return false;
    }
    return true;
}

static inline bool
allzeros_size_t(const void *ptr, size_t len)
{
    const unsigned char *p = (const unsigned char *) ptr;
    const unsigned char *end = &p[len];
    const unsigned char *aligned_end = (const unsigned char *)
        ((uintptr_t) end & (~(sizeof(size_t) - 1)));

    /* Compare bytes until the pointer "p" is aligned */
    while (((uintptr_t) p & (sizeof(size_t) - 1)) != 0)
    {
        if (p == end)
            return true;
        if (*p++ != 0)
            return false;
    }

    /*
     * Compare remaining size_t-aligned chunks.
     *
     * aligned_end cant' be > end as we ensured to take care of len < 8 (in
     * the len < 64 check below). So, no risk to read beyond the memory area.
     */
    for (; p < aligned_end; p += sizeof(size_t))
    {
        if (*(size_t *) p != 0)
            return false;
    }

    /* Compare remaining bytes until the end */
    while (p < end)
    {
        if (*p++ != 0)
            return false;
    }
    return true;
}

static inline bool
pg_memory_is_all_zeros_v10(const void *ptr, size_t len)
{
    const unsigned char *p = (const unsigned char *) ptr;
    const unsigned char *end = &p[len];
    const unsigned char *aligned_end = (const unsigned char *)
        ((uintptr_t) end & (~(sizeof(size_t) - 1)));

    /* Compare bytes until the pointer "p" is aligned */
    while (((uintptr_t) p & (sizeof(size_t) - 1)) != 0)
    {
        if (p == end)
            return true;
        if (*p++ != 0)
            return false;
    }

    /*
     * Compare 8 * sizeof(size_t) chunks at once.
     *
     * For performance reasons, we manually unroll this loop and purposefully
     * use bitwise-ORs to combine each comparison. This prevents boolean
     * short-circuiting and lets the compiler know that it's safe to access
     * all 8 elements regardless of the result of the other comparisons. This
     * seems to be enough to coax a few compilers into using SIMD
     * instructions.
     *
     * There is no risk to read beyond the memory area thanks to the len < 64
     * check done below.
     */
    for (; p < aligned_end - (sizeof(size_t) * 7); p += sizeof(size_t) * 8)
    {
        if ((((size_t *) p)[0] != 0) | (((size_t *) p)[1] != 0) |
            (((size_t *) p)[2] != 0) | (((size_t *) p)[3] != 0) |
            (((size_t *) p)[4] != 0) | (((size_t *) p)[5] != 0) |
            (((size_t *) p)[6] != 0) | (((size_t *) p)[7] != 0))
            return false;
    }

    /*
     * Compare remaining size_t-aligned chunks.
     *
     * aligned_end cant' be > end as we ensured to take care of len < 8 (in
     * the len < 64 check below). So, no risk to read beyond the memory area.
     */
    for (; p < aligned_end; p += sizeof(size_t))
    {
        if (*(size_t *) p != 0)
            return false;
    }

    /* Compare remaining bytes until the end */
    while (p < end)
    {
        if (*p++ != 0)
            return false;
    }
    return true;
}

static inline bool
pg_memory_is_all_zeros_v11(const void *ptr, size_t len)
{
    const unsigned char *p = (const unsigned char *) ptr;
    const unsigned char *end = &p[len];
    const unsigned char *aligned_end = (const unsigned char *)
        ((uintptr_t) end & (~(sizeof(size_t) - 1)));

    /*
     * For len < 64, compare byte per byte to ensure we'll not read beyond the
     * memory area.
     */
    if (len < sizeof(size_t) * 8)
    {
        while (p < end)
        {
            if (*p++ != 0)
                return false;
        }
        return true;
    }

    /* Compare bytes until the pointer "p" is aligned */
    while (((uintptr_t) p & (sizeof(size_t) - 1)) != 0)
    {
        if (p == end)
            return true;
        if (*p++ != 0)
            return false;
    }

    /*
     * Compare 8 * sizeof(size_t) chunks at once.
     *
     * For performance reasons, we manually unroll this loop and purposefully
     * use bitwise-ORs to combine each comparison. This prevents boolean
     * short-circuiting and lets the compiler know that it's safe to access
     * all 8 elements regardless of the result of the other comparisons. This
     * seems to be enough to coax a few compilers into using SIMD
     * instructions.
     *
     * There is no risk to read beyond the memory area thanks to the len < 64
     * check done below.
     */
    for (; p < aligned_end - (sizeof(size_t) * 7); p += sizeof(size_t) * 8)
    {
        if ((((size_t *) p)[0] != 0) | (((size_t *) p)[1] != 0) |
            (((size_t *) p)[2] != 0) | (((size_t *) p)[3] != 0) |
            (((size_t *) p)[4] != 0) | (((size_t *) p)[5] != 0) |
            (((size_t *) p)[6] != 0) | (((size_t *) p)[7] != 0))
            return false;
    }

    /*
     * Compare remaining size_t-aligned chunks.
     *
     * aligned_end cant' be > end as we ensured to take care of len < 8 (in
     * the len < 64 check below). So, no risk to read beyond the memory area.
     */
    for (; p < aligned_end; p += sizeof(size_t))
    {
        if (*(size_t *) p != 0)
            return false;
    }

    /* Compare remaining bytes until the end */
    while (p < end)
    {
        if (*p++ != 0)
            return false;
    }
    return true;
}

static inline bool
pg_memory_is_all_zeros_v12(const void *ptr, size_t len)
{
    const unsigned char *p = (const unsigned char *) ptr;
    const unsigned char *end = &p[len];
    const unsigned char *aligned_end = (const unsigned char *)
        ((uintptr_t) end & (~(sizeof(size_t) - 1)));

    if (len < sizeof(size_t))           // < 8 bytes
    {
        while (p < end)
        {
            if (*p++ != 0)
                return false;
        }
        return true;
    }
    else if (len < sizeof(size_t) * 8)  // 8-63 bytes
    {
        while (((uintptr_t) p & (sizeof(size_t) - 1)) != 0)
        {
            if (p == end)
                return true;
            if (*p++ != 0)
                return false;
        }
        for (; p < aligned_end; p += sizeof(size_t))
        {
            if (*(size_t *) p != 0)
                return false;
        }
        while (p < end)
        {
            if (*p++ != 0)
                return false;
        }
        return true;
    }

    /* Compare bytes until the pointer "p" is aligned */
    while (((uintptr_t) p & (sizeof(size_t) - 1)) != 0)
    {
        if (p == end)
            return true;
        if (*p++ != 0)
            return false;
    }

    /*
     * Compare 8 * sizeof(size_t) chunks at once.
     *
     * For performance reasons, we manually unroll this loop and purposefully
     * use bitwise-ORs to combine each comparison. This prevents boolean
     * short-circuiting and lets the compiler know that it's safe to access
     * all 8 elements regardless of the result of the other comparisons. This
     * seems to be enough to coax a few compilers into using SIMD
     * instructions.
     *
     * There is no risk to read beyond the memory area thanks to the len < 64
     * check done below.
     */
    for (; p < aligned_end - (sizeof(size_t) * 7); p += sizeof(size_t) * 8)
    {
        if ((((size_t *) p)[0] != 0) | (((size_t *) p)[1] != 0) |
            (((size_t *) p)[2] != 0) | (((size_t *) p)[3] != 0) |
            (((size_t *) p)[4] != 0) | (((size_t *) p)[5] != 0) |
            (((size_t *) p)[6] != 0) | (((size_t *) p)[7] != 0))
            return false;
    }

    /*
     * Compare remaining size_t-aligned chunks.
     *
     * aligned_end cant' be > end as we ensured to take care of len < 8 (in
     * the len < 64 check below). So, no risk to read beyond the memory area.
     */
    for (; p < aligned_end; p += sizeof(size_t))
    {
        if (*(size_t *) p != 0)
            return false;
    }

    /* Compare remaining bytes until the end */
    while (p < end)
    {
        if (*p++ != 0)
            return false;
    }
    return true;
}

static inline bool
pg_memory_is_all_zeros_simd(const size_t *p, const size_t *end)
{
    for (; p < end; p += sizeof(size_t) * 8)
    {
        if ((((size_t *) p)[0] != 0) | (((size_t *) p)[1] != 0) |
            (((size_t *) p)[2] != 0) | (((size_t *) p)[3] != 0) |
            (((size_t *) p)[4] != 0) | (((size_t *) p)[5] != 0) |
            (((size_t *) p)[6] != 0) | (((size_t *) p)[7] != 0))
            return false;
    }
    return true;
}

static inline bool
pg_memory_is_all_zeros_v14(const void *ptr, size_t len)
{
    const unsigned char *p = (const unsigned char *) ptr;
    const unsigned char *end = &p[len];
    const unsigned char *aligned_end = (const unsigned char *)
        ((uintptr_t) end & (~(sizeof(size_t) - 1)));

    if ((len >= sizeof(size_t) * 8) &&
        (((uintptr_t) p & (sizeof(size_t) - 1)) == 0))
    {
        return pg_memory_is_all_zeros_simd(ptr, ptr + len);
    }

    if (len < sizeof(size_t))
    {
        while (p < end)
        {
            if (*p++ != 0)
                return false;
        }
        return true;
    }
    else if (len < sizeof(size_t) * 8)
    {
        while (((uintptr_t) p & (sizeof(size_t) - 1)) != 0)
        {
            if (p == end)
                return true;
            if (*p++ != 0)
                return false;
        }
        for (; p < aligned_end; p += sizeof(size_t))
        {
            if (*(size_t *) p != 0)
                return false;
        }
        while (p < end)
        {
            if (*p++ != 0)
                return false;
        }
        return true;
    }

    /* Compare bytes until the pointer "p" is aligned */
    while (((uintptr_t) p & (sizeof(size_t) - 1)) != 0)
    {
        if (p == end)
            return true;
        if (*p++ != 0)
            return false;
    }

    /*
     * Compare 8 * sizeof(size_t) chunks at once.
     *
     * For performance reasons, we manually unroll this loop and purposefully
     * use bitwise-ORs to combine each comparison. This prevents boolean
     * short-circuiting and lets the compiler know that it's safe to access
     * all 8 elements regardless of the result of the other comparisons. This
     * seems to be enough to coax a few compilers into using SIMD
     * instructions.
     *
     * There is no risk to read beyond the memory area thanks to the len < 64
     * check done below.
     */
    for (; p < aligned_end - (sizeof(size_t) * 7); p += sizeof(size_t) * 8)
    {
        if ((((size_t *) p)[0] != 0) | (((size_t *) p)[1] != 0) |
            (((size_t *) p)[2] != 0) | (((size_t *) p)[3] != 0) |
            (((size_t *) p)[4] != 0) | (((size_t *) p)[5] != 0) |
            (((size_t *) p)[6] != 0) | (((size_t *) p)[7] != 0))
            return false;
    }

    /*
     * Compare remaining size_t-aligned chunks.
     *
     * aligned_end cant' be > end as we ensured to take care of len < 8 (in
     * the len < 64 check below). So, no risk to read beyond the memory area.
     */
    for (; p < aligned_end; p += sizeof(size_t))
    {
        if (*(size_t *) p != 0)
            return false;
    }

    /* Compare remaining bytes until the end */
    while (p < end)
    {
        if (*p++ != 0)
            return false;
    }
    return true;
}

#define NANOSEC_PER_SEC 1000000000

// Returns difference in nanoseconds
int64_t
get_clock_diff(struct timespec *t1, struct timespec *t2)
{
    int64_t nanosec = (t1->tv_sec - t2->tv_sec) * NANOSEC_PER_SEC;

    nanosec += (t1->tv_nsec - t2->tv_nsec);
    return nanosec;
}

int
main()
{
    size_t pagebytes[BLCKSZ] = {0};
    volatile bool result;
    struct timespec start, end;
    int64_t byte_time, size_t_time;

    clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &start);
    for (int i = 0; i < LOOPS; i++)
    {
        result = allzeros_byte_per_byte(pagebytes, BLCKSZ);
    }
    clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &end);
    byte_time = get_clock_diff(&end, &start);
    printf("byte per byte: done in %ld nanoseconds\n", byte_time);

    clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &start);
    for (int i = 0; i < LOOPS; i++)
    {
        result = allzeros_size_t(pagebytes, BLCKSZ);
    }
    clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &end);
    size_t_time = get_clock_diff(&end, &start);
    printf("size_t: done in %ld nanoseconds (%g times faster than byte per byte)\n",
           size_t_time, (double) byte_time / size_t_time);

    clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &start);
    for (int i = 0; i < LOOPS; i++)
    {
        result = pg_memory_is_all_zeros_v10(pagebytes, BLCKSZ);
    }
    clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &end);
    size_t_time = get_clock_diff(&end, &start);
    printf("SIMD v10: done in %ld nanoseconds (%g times faster than byte per byte)\n",
           size_t_time, (double) byte_time / size_t_time);

    clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &start);
    for (int i = 0; i < LOOPS; i++)
    {
        result = pg_memory_is_all_zeros_v11(pagebytes, BLCKSZ);
    }
    clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &end);
    size_t_time = get_clock_diff(&end, &start);
    printf("SIMD v11: done in %ld nanoseconds (%g times faster than byte per byte)\n",
           size_t_time, (double) byte_time / size_t_time);

    clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &start);
    for (int i = 0; i < LOOPS; i++)
    {
        result = pg_memory_is_all_zeros_v12(pagebytes, BLCKSZ);
    }
    clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &end);
    size_t_time = get_clock_diff(&end, &start);
    printf("SIMD v12: done in %ld nanoseconds (%g times faster than byte per byte)\n",
           size_t_time, (double) byte_time / size_t_time);

    clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &start);
    for (int i = 0; i < LOOPS; i++)
    {
        result = pg_memory_is_all_zeros_v14(pagebytes, BLCKSZ);
    }
    clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &end);
    size_t_time = get_clock_diff(&end, &start);
    printf("SIMD v14: done in %ld nanoseconds (%g times faster than byte per byte)\n",
           size_t_time, (double) byte_time / size_t_time);

    return 0;
}
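A further baseline that could be bolted onto the benchmark above (an
assumption on my side, the name allzeros_memcmp is only illustrative and I
have not timed it): memcmp against a static all-zero block. As a standalone
sketch:

#include <stdbool.h>
#include <stddef.h>
#include <string.h>
#include <stdio.h>

#define BLCKSZ 8192

static const unsigned char zeroblock[BLCKSZ];   /* all zeros */

static inline bool
allzeros_memcmp(const void *ptr, size_t len)
{
    /* Only valid for len <= BLCKSZ, which is all the benchmark uses. */
    return memcmp(ptr, zeroblock, len) == 0;
}

int
main(void)
{
    unsigned char page[BLCKSZ] = {0};

    printf("memcmp baseline: %s\n",
           allzeros_memcmp(page, BLCKSZ) ? "is_allzeros" : "not is allzeros");
    return 0;
}

In the benchmark it would be timed the same way as the other variants, i.e.
result = allzeros_memcmp(pagebytes, BLCKSZ) inside a LOOPS loop.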
[Attachment: v3_allzeros_check.c]

#include <stdbool.h>
#include <stddef.h>
#include <string.h>
#include <stdio.h>
#include <stdint.h>
#include <time.h>
#include <immintrin.h>

#define BLCKSZ 8192
#define LOOPS 1000

static inline bool
allzeros_byte_per_byte(const void *ptr, size_t len)
{
    const unsigned char *p = (const unsigned char *) ptr;
    const unsigned char *end = &p[len];

    while (p < end)
    {
        if (*p++ != 0)
            return false;
    }
    return true;
}

static inline bool
allzeros_size_t(const void *ptr, size_t len)
{
    const unsigned char *p = (const unsigned char *) ptr;
    const unsigned char *end = &p[len];
    const unsigned char *aligned_end = (const unsigned char *)
        ((uintptr_t) end & (~(sizeof(size_t) - 1)));

    /* Compare bytes until the pointer "p" is aligned */
    while (((uintptr_t) p & (sizeof(size_t) - 1)) != 0)
    {
        if (p == end)
            return true;
        if (*p++ != 0)
            return false;
    }

    /*
     * Compare remaining size_t-aligned chunks.
     *
     * aligned_end cant' be > end as we ensured to take care of len < 8 (in
     * the len < 64 check below). So, no risk to read beyond the memory area.
     */
    for (; p < aligned_end; p += sizeof(size_t))
    {
        if (*(size_t *) p != 0)
            return false;
    }

    /* Compare remaining bytes until the end */
    while (p < end)
    {
        if (*p++ != 0)
            return false;
    }
    return true;
}

static inline bool
pg_memory_is_all_zeros_v10(const void *ptr, size_t len)
{
    const unsigned char *p = (const unsigned char *) ptr;
    const unsigned char *end = &p[len];
    const unsigned char *aligned_end = (const unsigned char *)
        ((uintptr_t) end & (~(sizeof(size_t) - 1)));

    /* Compare bytes until the pointer "p" is aligned */
    while (((uintptr_t) p & (sizeof(size_t) - 1)) != 0)
    {
        if (p == end)
            return true;
        if (*p++ != 0)
            return false;
    }

    /*
     * Compare 8 * sizeof(size_t) chunks at once.
     *
     * For performance reasons, we manually unroll this loop and purposefully
     * use bitwise-ORs to combine each comparison. This prevents boolean
     * short-circuiting and lets the compiler know that it's safe to access
     * all 8 elements regardless of the result of the other comparisons. This
     * seems to be enough to coax a few compilers into using SIMD
     * instructions.
     *
     * There is no risk to read beyond the memory area thanks to the len < 64
     * check done below.
     */
    for (; p < aligned_end - (sizeof(size_t) * 7); p += sizeof(size_t) * 8)
    {
        if ((((size_t *) p)[0] != 0) | (((size_t *) p)[1] != 0) |
            (((size_t *) p)[2] != 0) | (((size_t *) p)[3] != 0) |
            (((size_t *) p)[4] != 0) | (((size_t *) p)[5] != 0) |
            (((size_t *) p)[6] != 0) | (((size_t *) p)[7] != 0))
            return false;
    }

    /*
     * Compare remaining size_t-aligned chunks.
     *
     * aligned_end cant' be > end as we ensured to take care of len < 8 (in
     * the len < 64 check below). So, no risk to read beyond the memory area.
     */
    for (; p < aligned_end; p += sizeof(size_t))
    {
        if (*(size_t *) p != 0)
            return false;
    }

    /* Compare remaining bytes until the end */
    while (p < end)
    {
        if (*p++ != 0)
            return false;
    }
    return true;
}

static inline bool
pg_memory_is_all_zeros_v11(const void *ptr, size_t len)
{
    const unsigned char *p = (const unsigned char *) ptr;
    const unsigned char *end = &p[len];
    const unsigned char *aligned_end = (const unsigned char *)
        ((uintptr_t) end & (~(sizeof(size_t) - 1)));

    /*
     * For len < 64, compare byte per byte to ensure we'll not read beyond the
     * memory area.
     */
    if (len < sizeof(size_t) * 8)
    {
        while (p < end)
        {
            if (*p++ != 0)
                return false;
        }
        return true;
    }

    /* Compare bytes until the pointer "p" is aligned */
    while (((uintptr_t) p & (sizeof(size_t) - 1)) != 0)
    {
        if (p == end)
            return true;
        if (*p++ != 0)
            return false;
    }

    /*
     * Compare 8 * sizeof(size_t) chunks at once.
     *
     * For performance reasons, we manually unroll this loop and purposefully
     * use bitwise-ORs to combine each comparison. This prevents boolean
     * short-circuiting and lets the compiler know that it's safe to access
     * all 8 elements regardless of the result of the other comparisons. This
     * seems to be enough to coax a few compilers into using SIMD
     * instructions.
     *
     * There is no risk to read beyond the memory area thanks to the len < 64
     * check done below.
     */
    for (; p < aligned_end - (sizeof(size_t) * 7); p += sizeof(size_t) * 8)
    {
        if ((((size_t *) p)[0] != 0) | (((size_t *) p)[1] != 0) |
            (((size_t *) p)[2] != 0) | (((size_t *) p)[3] != 0) |
            (((size_t *) p)[4] != 0) | (((size_t *) p)[5] != 0) |
            (((size_t *) p)[6] != 0) | (((size_t *) p)[7] != 0))
            return false;
    }

    /*
     * Compare remaining size_t-aligned chunks.
     *
     * aligned_end cant' be > end as we ensured to take care of len < 8 (in
     * the len < 64 check below). So, no risk to read beyond the memory area.
     */
    for (; p < aligned_end; p += sizeof(size_t))
    {
        if (*(size_t *) p != 0)
            return false;
    }

    /* Compare remaining bytes until the end */
    while (p < end)
    {
        if (*p++ != 0)
            return false;
    }
    return true;
}

static inline bool
pg_memory_is_all_zeros_v12(const void *ptr, size_t len)
{
    const unsigned char *p = (const unsigned char *) ptr;
    const unsigned char *end = &p[len];
    const unsigned char *aligned_end = (const unsigned char *)
        ((uintptr_t) end & (~(sizeof(size_t) - 1)));

    if (len < sizeof(size_t))           // < 8 bytes
    {
        while (p < end)
        {
            if (*p++ != 0)
                return false;
        }
        return true;
    }
    else if (len < sizeof(size_t) * 8)  // 8-63 bytes
    {
        while (((uintptr_t) p & (sizeof(size_t) - 1)) != 0)
        {
            if (p == end)
                return true;
            if (*p++ != 0)
                return false;
        }
        for (; p < aligned_end; p += sizeof(size_t))
        {
            if (*(size_t *) p != 0)
                return false;
        }
        while (p < end)
        {
            if (*p++ != 0)
                return false;
        }
        return true;
    }

    /* Compare bytes until the pointer "p" is aligned */
    while (((uintptr_t) p & (sizeof(size_t) - 1)) != 0)
    {
        if (p == end)
            return true;
        if (*p++ != 0)
            return false;
    }

    /*
     * Compare 8 * sizeof(size_t) chunks at once.
     *
     * For performance reasons, we manually unroll this loop and purposefully
     * use bitwise-ORs to combine each comparison. This prevents boolean
     * short-circuiting and lets the compiler know that it's safe to access
     * all 8 elements regardless of the result of the other comparisons. This
     * seems to be enough to coax a few compilers into using SIMD
     * instructions.
     *
     * There is no risk to read beyond the memory area thanks to the len < 64
     * check done below.
     */
    for (; p < aligned_end - (sizeof(size_t) * 7); p += sizeof(size_t) * 8)
    {
        if ((((size_t *) p)[0] != 0) | (((size_t *) p)[1] != 0) |
            (((size_t *) p)[2] != 0) | (((size_t *) p)[3] != 0) |
            (((size_t *) p)[4] != 0) | (((size_t *) p)[5] != 0) |
            (((size_t *) p)[6] != 0) | (((size_t *) p)[7] != 0))
            return false;
    }

    /*
     * Compare remaining size_t-aligned chunks.
     *
     * aligned_end cant' be > end as we ensured to take care of len < 8 (in
     * the len < 64 check below). So, no risk to read beyond the memory area.
     */
    for (; p < aligned_end; p += sizeof(size_t))
    {
        if (*(size_t *) p != 0)
            return false;
    }

    /* Compare remaining bytes until the end */
    while (p < end)
    {
        if (*p++ != 0)
            return false;
    }
    return true;
}

static inline bool
pg_memory_is_all_zeros_simd(const size_t *p, const size_t *end)
{
    for (; p < end; p += sizeof(size_t) * 8)
    {
        if ((((size_t *) p)[0] != 0) | (((size_t *) p)[1] != 0) |
            (((size_t *) p)[2] != 0) | (((size_t *) p)[3] != 0) |
            (((size_t *) p)[4] != 0) | (((size_t *) p)[5] != 0) |
            (((size_t *) p)[6] != 0) | (((size_t *) p)[7] != 0))
            return false;
    }
    return true;
}

static inline bool
pg_memory_is_all_zeros_v14(const void *ptr, size_t len)
{
    const unsigned char *p = (const unsigned char *) ptr;
    const unsigned char *end = &p[len];
    const unsigned char *aligned_end = (const unsigned char *)
        ((uintptr_t) end & (~(sizeof(size_t) - 1)));

    if ((len >= sizeof(size_t) * 8) &&
        (((uintptr_t) p & (sizeof(size_t) - 1)) == 0))
    {
        return pg_memory_is_all_zeros_simd(ptr, ptr + len);
    }

    if (len < sizeof(size_t))
    {
        while (p < end)
        {
            if (*p++ != 0)
                return false;
        }
        return true;
    }
    else if (len < sizeof(size_t) * 8)
    {
        while (((uintptr_t) p & (sizeof(size_t) - 1)) != 0)
        {
            if (p == end)
                return true;
            if (*p++ != 0)
                return false;
        }
        for (; p < aligned_end; p += sizeof(size_t))
        {
            if (*(size_t *) p != 0)
                return false;
        }
        while (p < end)
        {
            if (*p++ != 0)
                return false;
        }
        return true;
    }

    /* Compare bytes until the pointer "p" is aligned */
    while (((uintptr_t) p & (sizeof(size_t) - 1)) != 0)
    {
        if (p == end)
            return true;
        if (*p++ != 0)
            return false;
    }

    /*
     * Compare 8 * sizeof(size_t) chunks at once.
     *
     * For performance reasons, we manually unroll this loop and purposefully
     * use bitwise-ORs to combine each comparison. This prevents boolean
     * short-circuiting and lets the compiler know that it's safe to access
     * all 8 elements regardless of the result of the other comparisons. This
     * seems to be enough to coax a few compilers into using SIMD
     * instructions.
     *
     * There is no risk to read beyond the memory area thanks to the len < 64
     * check done below.
     */
    for (; p < aligned_end - (sizeof(size_t) * 7); p += sizeof(size_t) * 8)
    {
        if ((((size_t *) p)[0] != 0) | (((size_t *) p)[1] != 0) |
            (((size_t *) p)[2] != 0) | (((size_t *) p)[3] != 0) |
            (((size_t *) p)[4] != 0) | (((size_t *) p)[5] != 0) |
            (((size_t *) p)[6] != 0) | (((size_t *) p)[7] != 0))
            return false;
    }

    /*
     * Compare remaining size_t-aligned chunks.
     *
     * aligned_end cant' be > end as we ensured to take care of len < 8 (in
     * the len < 64 check below). So, no risk to read beyond the memory area.
     */
    for (; p < aligned_end; p += sizeof(size_t))
    {
        if (*(size_t *) p != 0)
            return false;
    }

    /* Compare remaining bytes until the end */
    while (p < end)
    {
        if (*p++ != 0)
            return false;
    }
    return true;
}

#define NANOSEC_PER_SEC 1000000000

// Returns difference in nanoseconds
int64_t
get_clock_diff(struct timespec *t1, struct timespec *t2)
{
    int64_t nanosec = (t1->tv_sec - t2->tv_sec) * NANOSEC_PER_SEC;

    nanosec += (t1->tv_nsec - t2->tv_nsec);
    return nanosec;
}

int
main()
{
    size_t pagebytes[BLCKSZ] = {0};
    volatile bool result;

    pagebytes[BLCKSZ-2] = 1;
    printf("pagebytes[BLCKSZ-2]=%ld\n", pagebytes[BLCKSZ-2]);

    result = allzeros_byte_per_byte(pagebytes, BLCKSZ);
    printf("byte per byte: %s\n", (result ? "is_allzeros" : "not is allzeros"));

    result = allzeros_size_t(pagebytes, BLCKSZ);
    printf("size_t: %s\n", (result ? "is_allzeros" : "not is allzeros"));

    result = pg_memory_is_all_zeros_v10(pagebytes, BLCKSZ);
    printf("SIMD v10: %s\n", (result ? "is_allzeros" : "not is allzeros"));

    result = pg_memory_is_all_zeros_v11(pagebytes, BLCKSZ);
    printf("SIMD v11: %s\n", (result ? "is_allzeros" : "not is allzeros"));

    result = pg_memory_is_all_zeros_v12(pagebytes, BLCKSZ);
    printf("SIMD v12: %s\n", (result ? "is_allzeros" : "not is allzeros"));

    result = pg_memory_is_all_zeros_v14(pagebytes, BLCKSZ);
    printf("SIMD v14: %s\n", (result ? "is_allzeros" : "not is allzeros"));

    return 0;
}
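In case it helps narrow down whether the new code path is safe, here is a
small cross-check harness (again an assumption on my side, not part of the
attached file): it is meant to be compiled together with the functions above
after renaming the existing main(), and it compares pg_memory_is_all_zeros_v14
against the byte-per-byte reference for a range of lengths and positions of a
single non-zero byte:

int
main(void)
{
    static unsigned char buf[512];

    for (size_t len = 0; len <= 256; len++)
    {
        for (size_t hit = 0; hit < len; hit++)
        {
            bool expected;
            bool got;

            /* one non-zero byte at offset "hit", everything else zero */
            memset(buf, 0, sizeof(buf));
            buf[hit] = 1;
            expected = allzeros_byte_per_byte(buf, len);
            got = pg_memory_is_all_zeros_v14(buf, len);
            if (expected != got)
            {
                printf("mismatch: len=%zu, non-zero byte at %zu: expected %d, got %d\n",
                       len, hit, (int) expected, (int) got);
                return 1;
            }
        }
    }
    printf("no mismatches found\n");
    return 0;
}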