On Thu, Nov 07, 2024 at 08:10:17AM +1300, David Rowley wrote:
> Did you try with a size where there's a decent remainder, say 124
> bytes? FWIW, one of the cases has 112 bytes, and I think that is
> aligned memory meaning we'll do the first 64 in the SIMD loop and have
> to do 48 bytes in the byte-at-a-time loop. If you had the loop Michael
> mentioned, that would instead be 6 loops of size_t-at-a-time.

See the attached allzeros.c, based on the previous versions exchanged.
And now just imagine a structure like that:
#define BLCKSZ 48
typedef union AlignedBlock
{
    char        data[BLCKSZ];
    double      force_align_d;
    int64_t     force_align_i64;
} AlignedBlock;

With this structure, the first step (the char-at-a-time loop that runs
until the pointer is aligned) is skipped because the pointer is already
aligned when allocated, and the second step (the potential SIMD loop)
is skipped because the structure is too small at 48 bytes.  Hence only
the last, byte-at-a-time step does the allzero check.  Adding a
size_t-at-a-time loop before that last step is going to be more
efficient, as proved upthread:
$ gcc -o allzeros -march=native -O2 allzeros.c
$ ./allzeros
allzeros: done in 118332297 nanoseconds
allzeros_v2: done in 13877745 nanoseconds (8.52677 times faster)

The allzero check is used for pgstat entries, and it could be possible
that some out-of-core code needs to rely on such small-ish sizes, or
even something else when a patch author feels like it.  So let's make
that optimized as much as we think we can: that's what this discussion
is about.
--
Michael
#include <stdbool.h>
#include <stddef.h>
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <time.h>

/* Number of calls made to each function inside its timed loop in main() */
#define LOOPS 10000000
/* Size, in bytes, of the buffer that gets checked for zeros */
#define BLCKSZ 48

/*
 * Buffer of BLCKSZ bytes.  The double and int64_t members are never
 * accessed; being union members, they give the whole union (and hence
 * "data") at least the alignment of those types.
 */
typedef union AlignedBlock
{
	char		data[BLCKSZ];
	double		force_align_d;
	int64_t		force_align_i64;
} AlignedBlock;

/*
 * isallzeros
 *
 * Returns true if the "len" bytes starting at "ptr" are all zero, false
 * otherwise.  A zero-length range is considered all zeros.
 *
 * The check runs in three phases:
 * 1) byte-at-a-time until the cursor is aligned on a size_t boundary;
 * 2) eight size_t words per iteration, as long as that many words remain
 *    before the last size_t boundary of the range;
 * 3) byte-at-a-time for whatever is left.
 */
static inline bool
isallzeros(const void *ptr, size_t len)
{
	const char *p = (const char *) ptr;
	const char *end = &p[len];
	/* "end" rounded down to a multiple of sizeof(size_t) */
	const char *aligned_end = (const char *) ((uintptr_t) end & (~(sizeof(size_t) - 1)));

	/* Phase 1: advance to a size_t boundary, or stop at the end of the range */
	while (((uintptr_t) p & (sizeof(size_t) - 1)) != 0)
	{
		if (p == end)
			return true;
		if (*p++ != 0)
			return false;
	}

	/*
	 * Phase 2: here p is aligned and p <= end, hence p <= aligned_end and
	 * the distance below is never negative.  Comparing the remaining
	 * distance avoids forming a pointer located before the start of the
	 * buffer, which "aligned_end - 7 words" would do for short inputs.
	 */
	while ((size_t) (aligned_end - p) >= sizeof(size_t) * 8)
	{
		const size_t *words = (const size_t *) p;

		/* Bitwise OR: evaluate all eight comparisons without branching */
		if ((words[0] != 0) |
			(words[1] != 0) |
			(words[2] != 0) |
			(words[3] != 0) |
			(words[4] != 0) |
			(words[5] != 0) |
			(words[6] != 0) |
			(words[7] != 0))
			return false;

		p += sizeof(size_t) * 8;
	}

	/* Phase 3: remaining bytes, one at a time */
	while (p < end)
	{
		if (*p++ != 0)
			return false;
	}
	return true;
}

/*
 * isallzeros_v2
 *
 * Returns true if the "len" bytes starting at "ptr" are all zero, false
 * otherwise.  A zero-length range is considered all zeros.
 *
 * The check runs in four phases:
 * 1) byte-at-a-time until the cursor is aligned on a size_t boundary;
 * 2) eight size_t words per iteration, as long as that many words remain
 *    before the last size_t boundary of the range;
 * 3) one size_t word per iteration up to that last boundary;
 * 4) byte-at-a-time for the (less than one word) tail.
 */
static inline bool
isallzeros_v2(const void *ptr, size_t len)
{
	const char *p = (const char *) ptr;
	const char *end = &p[len];
	/* "end" rounded down to a multiple of sizeof(size_t) */
	const char *aligned_end = (const char *) ((uintptr_t) end & (~(sizeof(size_t) - 1)));

	/* Phase 1: advance to a size_t boundary, or stop at the end of the range */
	while (((uintptr_t) p & (sizeof(size_t) - 1)) != 0)
	{
		if (p == end)
			return true;
		if (*p++ != 0)
			return false;
	}

	/*
	 * Phase 2: here p is aligned and p <= end, hence p <= aligned_end and
	 * the distance below is never negative.  Comparing the remaining
	 * distance avoids forming a pointer located before the start of the
	 * buffer, which "aligned_end - 7 words" would do for short inputs.
	 */
	while ((size_t) (aligned_end - p) >= sizeof(size_t) * 8)
	{
		const size_t *words = (const size_t *) p;

		/* Bitwise OR: evaluate all eight comparisons without branching */
		if ((words[0] != 0) |
			(words[1] != 0) |
			(words[2] != 0) |
			(words[3] != 0) |
			(words[4] != 0) |
			(words[5] != 0) |
			(words[6] != 0) |
			(words[7] != 0))
			return false;

		p += sizeof(size_t) * 8;
	}

	/* Phase 3: remaining whole words, one at a time */
	for (; p < aligned_end; p += sizeof(size_t))
	{
		if (*(const size_t *) p != 0)
			return false;
	}

	/* Phase 4: trailing bytes past the last size_t boundary */
	while (p < end)
	{
		if (*p++ != 0)
			return false;
	}
	return true;
}

#define NANOSEC_PER_SEC 1000000000

/*
 * Returns the difference "t1 - t2" in nanoseconds.
 *
 * The result is negative when t1 is earlier than t2.  The seconds delta
 * is widened to int64_t before being scaled, so the multiplication is
 * done in 64 bits even where time_t or int is narrower than that.
 */
int64_t
get_clock_diff(const struct timespec *t1, const struct timespec *t2)
{
	int64_t nanosec = ((int64_t) t1->tv_sec - (int64_t) t2->tv_sec) * NANOSEC_PER_SEC;

	nanosec += (int64_t) t1->tv_nsec - (int64_t) t2->tv_nsec;

	return nanosec;
}

/*
 * Benchmark driver.
 *
 * Allocates one zero-filled AlignedBlock, then times LOOPS calls of
 * isallzeros() followed by LOOPS calls of isallzeros_v2() on it, using
 * the process CPU-time clock, and prints both timings along with their
 * ratio.
 *
 * Returns 0 on success, 1 if the block cannot be allocated.
 */
int main(void)
{
	AlignedBlock *pagebytes;
	/* volatile, presumably so that the calls below are not optimized away */
	volatile bool result;
	struct timespec start,end;
	int64_t char_time, size_t_time;

	pagebytes = malloc(sizeof *pagebytes);
	if (pagebytes == NULL)
	{
		fprintf(stderr, "out of memory\n");
		return 1;
	}
	memset(pagebytes, 0, sizeof *pagebytes);

	clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &start);

	for (int i = 0; i < LOOPS; i++)
	{
		result = isallzeros(pagebytes, sizeof(AlignedBlock));
	}

	clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &end);
	char_time = get_clock_diff(&end, &start);
	/* int64_t is not necessarily "long": print through long long */
	printf("allzeros: done in %lld nanoseconds\n", (long long) char_time);

	clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &start);

	for (int i = 0; i < LOOPS; i++)
	{
		result = isallzeros_v2(pagebytes, sizeof(AlignedBlock));
	}

	clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &end);
	size_t_time = get_clock_diff(&end, &start);
	printf("allzeros_v2: done in %lld nanoseconds (%g times faster)\n", (long long) size_t_time, (double) char_time / size_t_time);

	free(pagebytes);

	return 0;
}

Attachment: signature.asc
Description: PGP signature

Reply via email to