On Sun, Apr 9, 2023 at 4:52 PM Thomas Munro <thomas.mu...@gmail.com> wrote:
> Here, btrfs seems to be taking a different path that I can't quite
> make out...  I see no warning/error about a checksum failure like [1],
> and we apparently managed to read something other than a mix of the
> old and new page contents (which, based on your hypothesis, should
> just leave it indeterminate whether the hint bit changes were captured
> or not, and the rest of the page should be stable, right).  It's like
> the page time-travelled or got scrambled in some other way, but it
> didn't tell us?  I'll try to dig further...

I think there are two separate bad phenomena.

1.  A concurrent modification of the user space buffer while writing
breaks the checksum so you can't read the data back in, as .  I can
reproduce that with a stand-alone program, attached.  The "verifier"
process occasionally reports EIO while reading, unless you comment out
the "scribbler" process's active line.  The system log/dmesg gets some
warnings.

2.  The crake-style failure doesn't involve any reported checksum
failures or errors, and I'm not sure if another process is even
involved.  I attach a complete syscall trace of a repro session.  (I
tried to get strace to dump 8192 byte strings, but then it doesn't
repro, so we have only the start of the data transferred for each
page.)  Working back from the error message,

ERROR:  invalid page in block 78 of relation base/5/16384,

we have a page at offset 638976, and we can find all system calls that
touched that offset:

[pid 26031] 23:26:48.521123 pwritev(50,
[{iov_base="\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"...,
iov_len=8192}], 1, 638976) = 8192

[pid 26040] 23:26:48.568975 pwrite64(5,
"\0\0\0\0\0Nj\1\0\0\0\0\240\3\300\3\0 \4
\0\0\0\0\340\2378\0\300\2378\0"..., 8192, 638976) = 8192

[pid 26040] 23:26:48.593157 pread64(6,
"\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"...,
8192, 638976) = 8192

In between the write of non-zeros and the read of zeros, nothing seems
to happen that could justify that, that I can grok, but perhaps
someone else will see something that I'm missing.  We pretty much just
have the parallel worker scanning the table, and writing stuff out as
it does it.  This was obtained with:

strace -f --absolute-timestamps=time,us ~/install/bin/postgres -D
pgdata -c io_direct=data -c shared_buffers=256kB -c wal_level=minimal
-c max_wal_senders=0 2>&1 | tee trace.log

The repro is just:

set debug_parallel_query=regress;
drop table if exists t;
create table t as select generate_series(1, 10000);
update t set generate_series = 1;
select count(*) from t;

Occasionally it fails in a different way: after create table t, later
references to t can't find it in the catalogs but there is no invalid
page error.  Perhaps the freaky zeros are happening one 4k page at a
time but perhaps if you get two in a row it might look like an empty
catalog page and pass validation.

Attachment: repro-strace.log.gz
Description: application/gzip

#define _GNU_SOURCE

#include <fcntl.h>
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/wait.h>
#include <unistd.h>

#define FLAGS (O_RDWR | O_CLOEXEC | O_DIRECT)
#define BUFFER_SIZE 8192
#define ALIGN 4096
#define LOOPS 1000000

#define PROCESS_WRITER 1
#define PROCESS_SCRIBBLER 2
#define PROCESS_VERIFIER 3

struct shared_data {
	uint8_t write_buffer[BUFFER_SIZE];
	int blockno;
	pthread_barrier_t barrier;
};

static int
run_process(int which, struct shared_data *shared)
{
	int fd;

	fd = open("file", FLAGS, 0644);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	for (int i = 0; i < LOOPS; i++) {

		if (which == PROCESS_WRITER) {
			shared->blockno = rand() % 88;
			memset(shared->write_buffer, 0x00, sizeof(shared->write_buffer));
			if (pwrite(fd, shared->write_buffer, BUFFER_SIZE, BUFFER_SIZE * shared->blockno) < BUFFER_SIZE)
				perror("writer: initialization pwrite failed or short");
			memset(shared->write_buffer, 0xff, BUFFER_SIZE);
		}

		pthread_barrier_wait(&shared->barrier);

		if (which == PROCESS_WRITER) {
			if (pwrite(fd, shared->write_buffer, BUFFER_SIZE, BUFFER_SIZE * shared->blockno) < BUFFER_SIZE)
				perror("writer: main pwrite failed or short");
		} else if (which == PROCESS_SCRIBBLER) {
			for (int j = 0; j < 3; j++)
				shared->write_buffer[rand() % BUFFER_SIZE] = 0x42;
		}

		pthread_barrier_wait(&shared->barrier);

		if (which == PROCESS_VERIFIER) {
			uint8_t read_buffer[BUFFER_SIZE];

			if (pread(fd, read_buffer, BUFFER_SIZE, BUFFER_SIZE * shared->blockno) < BUFFER_SIZE) {
				perror("pread failed or short while verifying");
			} else {
				for (int i = 0; i < BUFFER_SIZE; i++) {
					if (read_buffer[i] == 0xff)
						continue;
					if (read_buffer[i] == 0x42)
						continue;
					printf("got unexpected byte %d at position %d in block %d (stopped looking after that)\n",
						read_buffer[i], i, shared->blockno);
					break;
				}
			}
		}

		pthread_barrier_wait(&shared->barrier);
	}

	return 0;
}

int
main(int argc, char *argv[])
{
	pid_t writer_pid;
	pid_t scribbler_pid;
	pid_t verifier_pid;
	char *memory;
	struct shared_data *shared;
    pthread_barrierattr_t barrier_attr;

	memory = mmap(NULL, sizeof(struct shared_data) + ALIGN,
		PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0);
	if (memory == MAP_FAILED) {
		perror("mmap");
		return EXIT_FAILURE;
	}

	shared = (struct shared_data *)
		((((uintptr_t) memory + ALIGN - 1) / ALIGN) * ALIGN);

    pthread_barrierattr_setpshared(&barrier_attr, PTHREAD_PROCESS_SHARED);
	pthread_barrier_init(&shared->barrier, &barrier_attr, 3);

	writer_pid = fork();
	if (writer_pid == 0)
		return run_process(PROCESS_WRITER, shared);
	verifier_pid = fork();
	if (verifier_pid == 0)
		return run_process(PROCESS_VERIFIER, shared);
	run_process(PROCESS_SCRIBBLER, shared);

	waitpid(verifier_pid, NULL, 0);
	waitpid(writer_pid, NULL, 0);
	return EXIT_SUCCESS;
}

Reply via email to