On Wed, Oct 26, 2011 at 09:52, Heikki Linnakangas <heikki.linnakan...@enterprisedb.com> wrote: > (CC'ing pgsql-hackers, this started as an IM discussion yesterday but really > belongs in the archives) > > On 25.10.2011 23:52, Magnus Hagander wrote: >>> >>> There's a tiny chance to get incomplete xlog files with pg_receivexlog if >>> you crash: >>> 1. pg_receivexlog finishes write()ing a file but system crashes before >>> fsync() finishes. >>> 2. When pg_receivexlog restarts after crash, the last WAL file was not >>> fully flushed to disk, with >>> holes in the middle, but it has the right length. pg_receivexlog will >>> continue streaming from the next file. >>> not sure if we care about such a narrow window, but maybe we do >> >> So how would we go about fixing that? Always unlink the last file in >> the directory and try from there would seem dangerous too - what if >> it's not available on the master anymore, then we might have given up >> on data... > > Start streaming from the beginning of the last segment, but don't unlink it > first. Just overwrite it as you receive the data. > > Or, always create new xlog file as "0000000100000001000000D3.partial", and > only when it's fully written, fsync it, and then rename it to > "0000000100000001000000D3". Then you know that if a file doesn't have the > .partial suffix, it's complete. The fact that the last partial file always > has the .partial suffix needs some extra pushups in the restore_command, > though.
Here's a version that does this. Turns out this requires a lot less code than what was previously in there, which is always nice. We still need to solve the other part which is how to deal with the partial files on restore. But this is definitely a cleaner way from a pure pg_receivexlog perspective. Comments/reviews? -- Magnus Hagander Me: http://www.hagander.net/ Work: http://www.redpill-linpro.com/
*** a/src/bin/pg_basebackup/pg_receivexlog.c --- b/src/bin/pg_basebackup/pg_receivexlog.c *************** *** 71,104 **** usage(void) static bool segment_callback(XLogRecPtr segendpos, uint32 timeline) { - char fn[MAXPGPATH]; - struct stat statbuf; - if (verbose) fprintf(stderr, _("%s: finished segment at %X/%X (timeline %u)\n"), progname, segendpos.xlogid, segendpos.xrecoff, timeline); /* - * Check if there is a partial file for the name we just finished, and if - * there is, remove it under the assumption that we have now got all the - * data we need. - */ - segendpos.xrecoff /= XLOG_SEG_SIZE; - PrevLogSeg(segendpos.xlogid, segendpos.xrecoff); - snprintf(fn, sizeof(fn), "%s/%08X%08X%08X.partial", - basedir, timeline, - segendpos.xlogid, - segendpos.xrecoff); - if (stat(fn, &statbuf) == 0) - { - /* File existed, get rid of it */ - if (verbose) - fprintf(stderr, _("%s: removing file \"%s\"\n"), - progname, fn); - unlink(fn); - } - - /* * Never abort from this - we handle all aborting in continue_streaming() */ return false; --- 71,81 ---- *************** *** 133,139 **** FindStreamingStart(XLogRecPtr currentpos, uint32 currenttimeline) bool b; uint32 high_log = 0; uint32 high_seg = 0; - bool partial = false; dir = opendir(basedir); if (dir == NULL) --- 110,115 ---- *************** *** 195,201 **** FindStreamingStart(XLogRecPtr currentpos, uint32 currenttimeline) disconnect_and_exit(1); } ! if (statbuf.st_size == 16 * 1024 * 1024) { /* Completed segment */ if (log > high_log || --- 171,177 ---- disconnect_and_exit(1); } ! if (statbuf.st_size == XLOG_SEG_SIZE) { /* Completed segment */ if (log > high_log || *************** *** 208,244 **** FindStreamingStart(XLogRecPtr currentpos, uint32 currenttimeline) } else { ! /* ! * This is a partial file. Rename it out of the way. ! */ ! char newfn[MAXPGPATH]; ! ! fprintf(stderr, _("%s: renaming partial file \"%s\" to \"%s.partial\"\n"), ! progname, dirent->d_name, dirent->d_name); ! ! snprintf(newfn, sizeof(newfn), "%s/%s.partial", ! basedir, dirent->d_name); ! ! if (stat(newfn, &statbuf) == 0) ! { ! /* ! * XXX: perhaps we should only error out if the existing file ! * is larger? ! */ ! fprintf(stderr, _("%s: file \"%s\" already exists. Check and clean up manually.\n"), ! progname, newfn); ! disconnect_and_exit(1); ! } ! if (rename(fullpath, newfn) != 0) ! { ! fprintf(stderr, _("%s: could not rename \"%s\" to \"%s\": %s\n"), ! progname, fullpath, newfn, strerror(errno)); ! disconnect_and_exit(1); ! } ! ! /* Don't continue looking for more, we assume this is the last */ ! partial = true; ! break; } } --- 184,192 ---- } else { ! fprintf(stderr, _("%s: segment file '%s' is incorrect size %d, skipping\n"), ! progname, dirent->d_name, (int) statbuf.st_size); ! continue; } } *************** *** 247,263 **** FindStreamingStart(XLogRecPtr currentpos, uint32 currenttimeline) if (high_log > 0 || high_seg > 0) { XLogRecPtr high_ptr; ! ! if (!partial) ! { ! /* ! * If the segment was partial, the pointer is already at the right ! * location since we want to re-transmit that segment. If it was ! * not, we need to move it to the next segment, since we are ! * tracking the last one that was complete. ! */ ! NextLogSeg(high_log, high_seg); ! } high_ptr.xlogid = high_log; high_ptr.xrecoff = high_seg * XLOG_SEG_SIZE; --- 195,205 ---- if (high_log > 0 || high_seg > 0) { XLogRecPtr high_ptr; ! /* ! * Move the starting pointer to the start of the next segment, ! * since the highest one we've seen was completed. ! */ ! NextLogSeg(high_log, high_seg); high_ptr.xlogid = high_log; high_ptr.xrecoff = high_seg * XLOG_SEG_SIZE; *** a/src/bin/pg_basebackup/receivelog.c --- b/src/bin/pg_basebackup/receivelog.c *************** *** 51,64 **** open_walfile(XLogRecPtr startpoint, uint32 timeline, char *basedir, char *namebu XLogFileName(namebuf, timeline, startpoint.xlogid, startpoint.xrecoff / XLOG_SEG_SIZE); ! snprintf(fn, sizeof(fn), "%s/%s", basedir, namebuf); ! f = open(fn, O_WRONLY | O_CREAT | O_EXCL | PG_BINARY, 0666); if (f == -1) fprintf(stderr, _("%s: Could not open WAL segment %s: %s\n"), ! progname, namebuf, strerror(errno)); return f; } /* * Local version of GetCurrentTimestamp(), since we are not linked with * backend code. --- 51,114 ---- XLogFileName(namebuf, timeline, startpoint.xlogid, startpoint.xrecoff / XLOG_SEG_SIZE); ! snprintf(fn, sizeof(fn), "%s/%s.partial", basedir, namebuf); ! f = open(fn, O_WRONLY | O_CREAT | PG_BINARY, 0666); if (f == -1) fprintf(stderr, _("%s: Could not open WAL segment %s: %s\n"), ! progname, fn, strerror(errno)); ! return f; } + static bool + close_walfile(int walfile, char *basedir, char *walname) + { + off_t size = lseek(walfile, 0, SEEK_END); + + if (size == -1) + { + fprintf(stderr, _("%s: could not get size of written file %s: %s\n"), + progname, walname, strerror(errno)); + return false; + } + + if (fsync(walfile) != 0) + { + fprintf(stderr, _("%s: could not fsync file %s: %s\n"), + progname, walname, strerror(errno)); + return false; + } + + if (close(walfile) != 0) + { + fprintf(stderr, _("%s: could not close file %s: %s\n"), + progname, walname, strerror(errno)); + return false; + } + + /* Rename the .partial file only if it's 16Mb */ + if (size == XLOG_SEG_SIZE) + { + char oldfn[MAXPGPATH]; + char newfn[MAXPGPATH]; + + snprintf(oldfn, sizeof(oldfn), "%s/%s.partial", basedir, walname); + snprintf(newfn, sizeof(newfn), "%s/%s", basedir, walname); + if (rename(oldfn, newfn) != 0) + { + fprintf(stderr, _("%s: could not rename file %s: %s\n"), + progname, walname, strerror(errno)); + return false; + } + } + else + fprintf(stderr, _("%s: not renaming %s, segment is not complete.\n"), + progname, walname); + + return true; + } + + /* * Local version of GetCurrentTimestamp(), since we are not linked with * backend code. *************** *** 178,187 **** ReceiveXlogStream(PGconn *conn, XLogRecPtr startpos, uint32 timeline, char *sysi if (stream_continue && stream_continue()) { if (walfile != -1) ! { ! fsync(walfile); ! close(walfile); ! } return true; } --- 228,235 ---- if (stream_continue && stream_continue()) { if (walfile != -1) ! /* Potential error message is written by close_walfile */ ! return close_walfile(walfile, basedir, current_walfile_name); return true; } *************** *** 360,367 **** ReceiveXlogStream(PGconn *conn, XLogRecPtr startpos, uint32 timeline, char *sysi /* Did we reach the end of a WAL segment? */ if (blockpos.xrecoff % XLOG_SEG_SIZE == 0) { ! fsync(walfile); ! close(walfile); walfile = -1; xlogoff = 0; --- 408,417 ---- /* Did we reach the end of a WAL segment? */ if (blockpos.xrecoff % XLOG_SEG_SIZE == 0) { ! if (!close_walfile(walfile, basedir, current_walfile_name)) ! /* Error message written in close_walfile() */ ! return false; ! walfile = -1; xlogoff = 0;
-- Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-hackers