Simon Riggs <si...@2ndquadrant.com> writes:
> Please post the patch rather than fixing directly. There's some subtle
> stuff there and it would be best to discuss first.

And here's a proposed patch for not throwing ERROR during replay of DROP
TABLESPACE.  I had originally thought this would be a one-liner
s/ERROR/LOG/, but on inspection destroy_tablespace_directories() really
needs to be changed too, so that it doesn't throw error for unremovable
directories.

                        regards, tom lane

diff --git a/src/backend/commands/tablespace.c b/src/backend/commands/tablespace.c
index 8ed6d06bb2370a54349f2a86ee13e08a381eacf2..ed0c9ea3ee5a372c6926f41fc0fc93ed7d80d6cd 100644
*** a/src/backend/commands/tablespace.c
--- b/src/backend/commands/tablespace.c
*************** create_tablespace_directories(const char
*** 626,634 ****
  /*
   * destroy_tablespace_directories
   *
!  * Attempt to remove filesystem infrastructure
   *
!  * 'redo' indicates we are redoing a drop from XLOG; okay if nothing there
   *
   * Returns TRUE if successful, FALSE if some subdirectory is not empty
   */
--- 626,638 ----
  /*
   * destroy_tablespace_directories
   *
!  * Attempt to remove filesystem infrastructure for the tablespace.
   *
!  * 'redo' indicates we are redoing a drop from XLOG; in that case we should
!  * not throw an ERROR for problems, just LOG them.  The worst consequence of
!  * not removing files here would be failure to release some disk space, which
!  * does not justify throwing an error that would require manual intervention
!  * to get the database running again.
   *
   * Returns TRUE if successful, FALSE if some subdirectory is not empty
   */
*************** destroy_tablespace_directories(Oid table
*** 681,686 ****
--- 685,700 ----
  			pfree(linkloc_with_version_dir);
  			return true;
  		}
+ 		else if (redo)
+ 		{
+ 			/* in redo, just log other types of error */
+ 			ereport(LOG,
+ 					(errcode_for_file_access(),
+ 					 errmsg("could not open directory \"%s\": %m",
+ 							linkloc_with_version_dir)));
+ 			pfree(linkloc_with_version_dir);
+ 			return false;
+ 		}
  		/* else let ReadDir report the error */
  	}
  
*************** destroy_tablespace_directories(Oid table
*** 704,710 ****
  
  		/* remove empty directory */
  		if (rmdir(subfile) < 0)
! 			ereport(ERROR,
  					(errcode_for_file_access(),
  					 errmsg("could not remove directory \"%s\": %m",
  							subfile)));
--- 718,724 ----
  
  		/* remove empty directory */
  		if (rmdir(subfile) < 0)
! 			ereport(redo ? LOG : ERROR,
  					(errcode_for_file_access(),
  					 errmsg("could not remove directory \"%s\": %m",
  							subfile)));
*************** destroy_tablespace_directories(Oid table
*** 716,738 ****
  
  	/* remove version directory */
  	if (rmdir(linkloc_with_version_dir) < 0)
! 		ereport(ERROR,
  				(errcode_for_file_access(),
  				 errmsg("could not remove directory \"%s\": %m",
  						linkloc_with_version_dir)));
  
  	/*
  	 * Try to remove the symlink.  We must however deal with the possibility
  	 * that it's a directory instead of a symlink --- this could happen during
  	 * WAL replay (see TablespaceCreateDbspace), and it is also the case on
  	 * Windows where junction points lstat() as directories.
  	 */
  	linkloc = pstrdup(linkloc_with_version_dir);
  	get_parent_directory(linkloc);
  	if (lstat(linkloc, &st) == 0 && S_ISDIR(st.st_mode))
  	{
  		if (rmdir(linkloc) < 0)
! 			ereport(ERROR,
  					(errcode_for_file_access(),
  					 errmsg("could not remove directory \"%s\": %m",
  							linkloc)));
--- 730,759 ----
  
  	/* remove version directory */
  	if (rmdir(linkloc_with_version_dir) < 0)
! 	{
! 		ereport(redo ? LOG : ERROR,
  				(errcode_for_file_access(),
  				 errmsg("could not remove directory \"%s\": %m",
  						linkloc_with_version_dir)));
+ 		pfree(linkloc_with_version_dir);
+ 		return false;
+ 	}
  
  	/*
  	 * Try to remove the symlink.  We must however deal with the possibility
  	 * that it's a directory instead of a symlink --- this could happen during
  	 * WAL replay (see TablespaceCreateDbspace), and it is also the case on
  	 * Windows where junction points lstat() as directories.
+ 	 *
+ 	 * Note: in the redo case, we'll return true if this final step fails;
+ 	 * there's no point in retrying it.
  	 */
  	linkloc = pstrdup(linkloc_with_version_dir);
  	get_parent_directory(linkloc);
  	if (lstat(linkloc, &st) == 0 && S_ISDIR(st.st_mode))
  	{
  		if (rmdir(linkloc) < 0)
! 			ereport(redo ? LOG : ERROR,
  					(errcode_for_file_access(),
  					 errmsg("could not remove directory \"%s\": %m",
  							linkloc)));
*************** destroy_tablespace_directories(Oid table
*** 740,746 ****
  	else
  	{
  		if (unlink(linkloc) < 0)
! 			ereport(ERROR,
  					(errcode_for_file_access(),
  					 errmsg("could not remove symbolic link \"%s\": %m",
  							linkloc)));
--- 761,767 ----
  	else
  	{
  		if (unlink(linkloc) < 0)
! 			ereport(redo ? LOG : ERROR,
  					(errcode_for_file_access(),
  					 errmsg("could not remove symbolic link \"%s\": %m",
  							linkloc)));
*************** tblspc_redo(XLogRecPtr lsn, XLogRecord *
*** 1451,1464 ****
  		xl_tblspc_drop_rec *xlrec = (xl_tblspc_drop_rec *) XLogRecGetData(record);
  
  		/*
! 		 * If we issued a WAL record for a drop tablespace it is because there
! 		 * were no files in it at all. That means that no permanent objects
! 		 * can exist in it at this point.
  		 *
  		 * It is possible for standby users to be using this tablespace as a
  		 * location for their temporary files, so if we fail to remove all
  		 * files then do conflict processing and try again, if currently
  		 * enabled.
  		 */
  		if (!destroy_tablespace_directories(xlrec->ts_id, true))
  		{
--- 1472,1490 ----
  		xl_tblspc_drop_rec *xlrec = (xl_tblspc_drop_rec *) XLogRecGetData(record);
  
  		/*
! 		 * If we issued a WAL record for a drop tablespace it implies that
! 		 * there were no files in it at all when the DROP was done. That means
! 		 * that no permanent objects can exist in it at this point.
  		 *
  		 * It is possible for standby users to be using this tablespace as a
  		 * location for their temporary files, so if we fail to remove all
  		 * files then do conflict processing and try again, if currently
  		 * enabled.
+ 		 *
+ 		 * Other possible reasons for failure include bollixed file permissions
+ 		 * on a standby server when they were okay on the primary, etc etc.
+ 		 * There's not much we can do about that, so just remove what we can
+ 		 * and press on.
  		 */
  		if (!destroy_tablespace_directories(xlrec->ts_id, true))
  		{
*************** tblspc_redo(XLogRecPtr lsn, XLogRecord *
*** 1466,1480 ****
  
  			/*
  			 * If we did recovery processing then hopefully the backends who
! 			 * wrote temp files should have cleaned up and exited by now. So
! 			 * lets recheck before we throw an error. If !process_conflicts
! 			 * then this will just fail again.
  			 */
  			if (!destroy_tablespace_directories(xlrec->ts_id, true))
! 				ereport(ERROR,
  						(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
  						 errmsg("tablespace %u is not empty",
! 								xlrec->ts_id)));
  		}
  	}
  	else
--- 1492,1509 ----
  
  			/*
  			 * If we did recovery processing then hopefully the backends who
! 			 * wrote temp files should have cleaned up and exited by now.  So
! 			 * retry before complaining.  If we fail again, this is just a LOG
! 			 * condition, because it's not worth throwing an ERROR for (as
! 			 * that would crash the database and require manual intervention
! 			 * before we could get past this WAL record on restart).
  			 */
  			if (!destroy_tablespace_directories(xlrec->ts_id, true))
! 				ereport(LOG,
  						(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
  						 errmsg("tablespace %u is not empty",
! 								xlrec->ts_id),
! 						 errdetail("Continuing even though some files could not be removed.")));
  		}
  	}
  	else
-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Reply via email to