This is an automated email from the ASF dual-hosted git repository.
dlmarion pushed a commit to branch 2.1
in repository https://gitbox.apache.org/repos/asf/accumulo.git
The following commit(s) were added to refs/heads/2.1 by this push:
new 041e4c1612 Changed Halt to print msg to stderr (#5574)
041e4c1612 is described below
commit 041e4c1612fe12df775d17f5af2839df70f4b584
Author: Dave Marion <[email protected]>
AuthorDate: Wed May 28 11:54:33 2025 -0400
Changed Halt to print msg to stderr (#5574)
Logging that happens in proximity to a Runtime.halt
may not make it to the log before the process shuts
down if an asynchronous logging implementation is used.
This change modifies the Halt methods to print any log
message to stderr and flushing before logging to the
logging implementation.
Closes #5545
---
.../core/fate/zookeeper/ServiceLockSupport.java | 23 +++++-------
.../java/org/apache/accumulo/core/util/Halt.java | 41 +++++++++++++---------
.../org/apache/accumulo/server/AbstractServer.java | 2 +-
.../accumulo/server/GarbageCollectionLogger.java | 2 +-
.../accumulo/server/manager/LiveTServerSet.java | 4 +--
.../apache/accumulo/server/rpc/TServerUtils.java | 2 +-
.../accumulo/server/util/FileSystemMonitor.java | 3 +-
.../accumulo/tserver/TabletClientHandler.java | 11 +++---
.../org/apache/accumulo/tserver/TabletServer.java | 3 +-
.../accumulo/tserver/log/TabletServerLogger.java | 14 ++++----
.../accumulo/tserver/tablet/MinorCompactor.java | 2 +-
.../org/apache/accumulo/tserver/tablet/Tablet.java | 2 +-
12 files changed, 53 insertions(+), 56 deletions(-)
diff --git
a/core/src/main/java/org/apache/accumulo/core/fate/zookeeper/ServiceLockSupport.java
b/core/src/main/java/org/apache/accumulo/core/fate/zookeeper/ServiceLockSupport.java
index cd5c400e01..e4514e0808 100644
---
a/core/src/main/java/org/apache/accumulo/core/fate/zookeeper/ServiceLockSupport.java
+++
b/core/src/main/java/org/apache/accumulo/core/fate/zookeeper/ServiceLockSupport.java
@@ -56,16 +56,13 @@ public class ServiceLockSupport {
LOG.warn("{} lost lock (reason = {}), not halting because shutdown is
complete.",
serviceName, reason);
} else {
- Halt.halt(serviceName + " lock in zookeeper lost (reason = " + reason
+ "), exiting!", -1);
+ Halt.halt(-1, serviceName + " lock in zookeeper lost (reason = " +
reason + "), exiting!");
}
}
@Override
public void unableToMonitorLockNode(final Exception e) {
- // ACCUMULO-3651 Changed level to error and added FATAL to message for
slf4j compatibility
- Halt.halt(-1,
- () -> LOG.error("FATAL: No longer able to monitor {} lock node",
serviceName, e));
-
+ Halt.halt(-1, "FATAL: No longer able to monitor " + serviceName + " lock
node", e);
}
@Override
@@ -73,7 +70,7 @@ public class ServiceLockSupport {
LOG.debug("Acquired {} lock", serviceName);
if (acquiredLock || failedToAcquireLock) {
- Halt.halt("Zoolock in unexpected state AL " + acquiredLock + " " +
failedToAcquireLock, -1);
+ Halt.halt(-1, "Zoolock in unexpected state AL " + acquiredLock + " " +
failedToAcquireLock);
}
acquiredLock = true;
@@ -88,12 +85,12 @@ public class ServiceLockSupport {
String msg =
"Failed to acquire " + serviceName + " lock due to incorrect
ZooKeeper authentication.";
LOG.error("{} Ensure instance.secret is consistent across Accumulo
configuration", msg, e);
- Halt.halt(msg, -1);
+ Halt.halt(-1, msg);
}
if (acquiredLock) {
- Halt.halt("Zoolock in unexpected state acquiredLock true with FAL " +
failedToAcquireLock,
- -1);
+ Halt.halt(-1,
+ "Zoolock in unexpected state acquiredLock true with FAL " +
failedToAcquireLock);
}
failedToAcquireLock = true;
@@ -145,16 +142,14 @@ public class ServiceLockSupport {
LOG.warn("{} lost lock (reason = {}), not halting because shutdown is
complete.",
serviceName, reason);
} else {
- Halt.halt(1, () -> {
- LOG.error("{} lost lock (reason = {}), exiting.", serviceName,
reason);
- lostLockAction.accept(serviceName);
- });
+ Halt.halt(1, serviceName + " lost lock (reason = " + reason + "),
exiting.",
+ () -> lostLockAction.accept(serviceName));
}
}
@Override
public void unableToMonitorLockNode(final Exception e) {
- Halt.halt(1, () -> LOG.error("Lost ability to monitor {} lock,
exiting.", serviceName, e));
+ Halt.halt(1, "Lost ability to monitor " + serviceName + " lock,
exiting.", e);
}
}
diff --git a/core/src/main/java/org/apache/accumulo/core/util/Halt.java
b/core/src/main/java/org/apache/accumulo/core/util/Halt.java
index ec822b48cc..1b10b92f16 100644
--- a/core/src/main/java/org/apache/accumulo/core/util/Halt.java
+++ b/core/src/main/java/org/apache/accumulo/core/util/Halt.java
@@ -28,34 +28,43 @@ import org.slf4j.LoggerFactory;
public class Halt {
private static final Logger log = LoggerFactory.getLogger(Halt.class);
- public static void halt(final String msg) {
- // ACCUMULO-3651 Changed level to error and added FATAL to message for
slf4j compatibility
- halt(0, new Runnable() {
- @Override
- public void run() {
- log.error("FATAL {}", msg);
- }
- });
+ public static void halt(final int status, final String msg) {
+ halt(status, msg, null, null);
}
- public static void halt(final String msg, int status) {
- halt(status, new Runnable() {
- @Override
- public void run() {
- log.error("FATAL {}", msg);
- }
- });
+ public static void halt(final int status, final String msg, final Throwable
exception) {
+ halt(status, msg, exception, null);
}
- public static void halt(final int status, Runnable runnable) {
+ public static void halt(final int status, final String msg, final Runnable
runnable) {
+ halt(status, msg, null, runnable);
+ }
+ public static void halt(final int status, final String msg, final Throwable
exception,
+ final Runnable runnable) {
try {
+
// give ourselves a little time to try and do something
Threads.createThread("Halt Thread", () -> {
sleepUninterruptibly(100, MILLISECONDS);
Runtime.getRuntime().halt(status);
}).start();
+ final String errorMessage = "FATAL " + msg;
+ // Printing to stderr and to the log in case the message does not make
+ // it to the log. This could happen if an asynchronous logging impl is
used
+ System.err.println(errorMessage);
+ if (exception != null) {
+ exception.printStackTrace();
+ }
+ System.err.flush();
+
+ if (exception != null) {
+ log.error(errorMessage, exception);
+ } else {
+ log.error(errorMessage);
+ }
+
if (runnable != null) {
runnable.run();
}
diff --git
a/server/base/src/main/java/org/apache/accumulo/server/AbstractServer.java
b/server/base/src/main/java/org/apache/accumulo/server/AbstractServer.java
index 20f7db5b12..4bee11be06 100644
--- a/server/base/src/main/java/org/apache/accumulo/server/AbstractServer.java
+++ b/server/base/src/main/java/org/apache/accumulo/server/AbstractServer.java
@@ -251,7 +251,7 @@ public abstract class AbstractServer
log.trace(
"ServiceLockVerificationThread - checking ServiceLock
existence in ZooKeeper");
if (lock != null && !lock.verifyLockAtSource()) {
- Halt.halt("Lock verification thread could not find lock",
-1);
+ Halt.halt(-1, "Lock verification thread could not find
lock");
}
// Need to sleep, not yield when the thread priority is
greater than NORM_PRIORITY
// so that this thread does not get immediately rescheduled.
diff --git
a/server/base/src/main/java/org/apache/accumulo/server/GarbageCollectionLogger.java
b/server/base/src/main/java/org/apache/accumulo/server/GarbageCollectionLogger.java
index 469ec5632d..d3de0fa837 100644
---
a/server/base/src/main/java/org/apache/accumulo/server/GarbageCollectionLogger.java
+++
b/server/base/src/main/java/org/apache/accumulo/server/GarbageCollectionLogger.java
@@ -117,7 +117,7 @@ public class GarbageCollectionLogger {
}
if (maxIncreaseInCollectionTime > keepAliveTimeout) {
- Halt.halt("Garbage collection may be interfering with lock keep-alive.
Halting.", -1);
+ Halt.halt(-1, "Garbage collection may be interfering with lock
keep-alive. Halting.");
}
lastMemorySize = mem;
diff --git
a/server/base/src/main/java/org/apache/accumulo/server/manager/LiveTServerSet.java
b/server/base/src/main/java/org/apache/accumulo/server/manager/LiveTServerSet.java
index 572c5954c1..1757e29580 100644
---
a/server/base/src/main/java/org/apache/accumulo/server/manager/LiveTServerSet.java
+++
b/server/base/src/main/java/org/apache/accumulo/server/manager/LiveTServerSet.java
@@ -465,9 +465,7 @@ public class LiveTServerSet implements Watcher {
context.getZooReaderWriter().recursiveDelete(fullpath, SKIP);
} catch (Exception e) {
String msg = "error removing tablet server lock";
- // ACCUMULO-3651 Changed level to error and added FATAL to message for
slf4j compatibility
- log.error("FATAL: {}", msg, e);
- Halt.halt(msg, -1);
+ Halt.halt(-1, msg, e);
}
getZooCache().clear(fullpath);
}
diff --git
a/server/base/src/main/java/org/apache/accumulo/server/rpc/TServerUtils.java
b/server/base/src/main/java/org/apache/accumulo/server/rpc/TServerUtils.java
index 22a191a7a6..332e286a3f 100644
--- a/server/base/src/main/java/org/apache/accumulo/server/rpc/TServerUtils.java
+++ b/server/base/src/main/java/org/apache/accumulo/server/rpc/TServerUtils.java
@@ -664,7 +664,7 @@ public class TServerUtils {
try {
finalServer.serve();
} catch (Error e) {
- Halt.halt("Unexpected error in TThreadPoolServer " + e + ", halting.",
1);
+ Halt.halt(1, "Unexpected error in TThreadPoolServer " + e + ",
halting.", e);
}
}).start();
diff --git
a/server/base/src/main/java/org/apache/accumulo/server/util/FileSystemMonitor.java
b/server/base/src/main/java/org/apache/accumulo/server/util/FileSystemMonitor.java
index 12d39bbf25..90a9c810ee 100644
---
a/server/base/src/main/java/org/apache/accumulo/server/util/FileSystemMonitor.java
+++
b/server/base/src/main/java/org/apache/accumulo/server/util/FileSystemMonitor.java
@@ -121,8 +121,7 @@ public class FileSystemMonitor {
try {
checkMount(mount);
} catch (final Exception e) {
- Halt.halt(-42,
- () -> log.error("Exception while checking mount points,
halting process", e));
+ Halt.halt(1, "Exception while checking mount points, halting
process", e);
}
}));
}
diff --git
a/server/tserver/src/main/java/org/apache/accumulo/tserver/TabletClientHandler.java
b/server/tserver/src/main/java/org/apache/accumulo/tserver/TabletClientHandler.java
index 1188bbdf1d..deb5b8df2c 100644
---
a/server/tserver/src/main/java/org/apache/accumulo/tserver/TabletClientHandler.java
+++
b/server/tserver/src/main/java/org/apache/accumulo/tserver/TabletClientHandler.java
@@ -1155,11 +1155,9 @@ public class TabletClientHandler implements
TabletClientService.Iface {
log.trace("Got {} message from user: {}", request,
credentials.getPrincipal());
if (server.getLock() != null && server.getLock().wasLockAcquired()
&& !server.getLock().isLocked()) {
- Halt.halt(1, () -> {
- log.info("Tablet server no longer holds lock during checkPermission()
: {}, exiting",
- request);
- server.getGcLogger().logGCInfo(server.getConfiguration());
- });
+ Halt.halt(1,
+ "Tablet server no longer holds lock during checkPermission() : " +
request + ", exiting",
+ () -> server.getGcLogger().logGCInfo(server.getConfiguration()));
}
if (lock != null) {
@@ -1344,8 +1342,7 @@ public class TabletClientHandler implements
TabletClientService.Iface {
checkPermission(context, server, credentials, lock, "halt");
- Halt.halt(0, () -> {
- log.info("Manager requested tablet server halt");
+ Halt.halt(0, "Manager requested tablet server halt", () -> {
server.gcLogger.logGCInfo(server.getConfiguration());
server.requestStop();
try {
diff --git
a/server/tserver/src/main/java/org/apache/accumulo/tserver/TabletServer.java
b/server/tserver/src/main/java/org/apache/accumulo/tserver/TabletServer.java
index 4ef6d7781f..bee674f795 100644
--- a/server/tserver/src/main/java/org/apache/accumulo/tserver/TabletServer.java
+++ b/server/tserver/src/main/java/org/apache/accumulo/tserver/TabletServer.java
@@ -946,8 +946,7 @@ public class TabletServer extends AbstractServer
iface.tabletServerStopping(TraceUtil.traceInfo(),
getContext().rpcCreds(),
getClientAddressString());
} catch (TException e) {
- LOG.error("Error informing Manager that we are shutting down, halting
server", e);
- Halt.halt("Error informing Manager that we are shutting down, exiting!",
-1);
+ Halt.halt(-1, "Error informing Manager that we are shutting down,
exiting!", e);
} finally {
returnManagerConnection(iface);
}
diff --git
a/server/tserver/src/main/java/org/apache/accumulo/tserver/log/TabletServerLogger.java
b/server/tserver/src/main/java/org/apache/accumulo/tserver/log/TabletServerLogger.java
index 46970b4c21..5cc5c26e56 100644
---
a/server/tserver/src/main/java/org/apache/accumulo/tserver/log/TabletServerLogger.java
+++
b/server/tserver/src/main/java/org/apache/accumulo/tserver/log/TabletServerLogger.java
@@ -250,9 +250,8 @@ public class TabletServerLogger {
throw new RuntimeException(e);
}
} else {
- log.error("Repeatedly failed to create WAL. Going to exit
tabletserver.", t);
// We didn't have retries or we failed too many times.
- Halt.halt("Experienced too many errors creating WALs, giving up", 1);
+ Halt.halt(1, "Experienced too many errors creating WALs, giving up",
t);
}
// The exception will trigger the log creation to be re-attempted.
@@ -395,7 +394,7 @@ public class TabletServerLogger {
boolean success = false;
while (!success) {
- boolean sawWriteFailure = false;
+ Throwable sawWriteFailure = null;
try {
// get a reference to the loggers that no other thread can touch
AtomicInteger currentId = new AtomicInteger(-1);
@@ -450,7 +449,7 @@ public class TabletServerLogger {
writeRetry.logRetry(log, "Logs closed while writing", ex);
} catch (Exception t) {
writeRetry.logRetry(log, "Failed to write to WAL", t);
- sawWriteFailure = true;
+ sawWriteFailure = t;
try {
// Backoff
writeRetry.waitForNextAttempt(log, "write to WAL");
@@ -467,10 +466,11 @@ public class TabletServerLogger {
final int finalCurrent = currentLogId;
if (!success) {
final ServiceLock tabletServerLock = tserver.getLock();
- if (sawWriteFailure) {
- log.info("WAL write failure, validating server lock in ZooKeeper");
+ if (sawWriteFailure != null) {
+ log.info("WAL write failure, validating server lock in ZooKeeper",
sawWriteFailure);
if (tabletServerLock == null ||
!tabletServerLock.verifyLockAtSource()) {
- Halt.halt("Writing to WAL has failed and TabletServer lock does
not exist", -1);
+ Halt.halt(-1, "Writing to WAL has failed and TabletServer lock
does not exist",
+ sawWriteFailure);
}
}
diff --git
a/server/tserver/src/main/java/org/apache/accumulo/tserver/tablet/MinorCompactor.java
b/server/tserver/src/main/java/org/apache/accumulo/tserver/tablet/MinorCompactor.java
index 71258af11d..f265056c6a 100644
---
a/server/tserver/src/main/java/org/apache/accumulo/tserver/tablet/MinorCompactor.java
+++
b/server/tserver/src/main/java/org/apache/accumulo/tserver/tablet/MinorCompactor.java
@@ -103,7 +103,7 @@ public class MinorCompactor extends FileCompactor {
if (tserverLock == null || !tserverLock.verifyLockAtSource()) {
log.error("Minor compaction of {} has failed and TabletServer
lock does not exist."
+ " Halting...", getExtent(), e);
- Halt.halt("TabletServer lock does not exist", -1);
+ Halt.halt(-1, "TabletServer lock does not exist", e);
} else {
throw e;
}
diff --git
a/server/tserver/src/main/java/org/apache/accumulo/tserver/tablet/Tablet.java
b/server/tserver/src/main/java/org/apache/accumulo/tserver/tablet/Tablet.java
index 246622d277..0f8773d93a 100644
---
a/server/tserver/src/main/java/org/apache/accumulo/tserver/tablet/Tablet.java
+++
b/server/tserver/src/main/java/org/apache/accumulo/tserver/tablet/Tablet.java
@@ -516,7 +516,7 @@ public class Tablet extends TabletBase {
if (tserverLock == null || !tserverLock.verifyLockAtSource()) {
log.error("Minor compaction of {} has failed and TabletServer lock
does not exist."
+ " Halting...", getExtent(), e);
- Halt.halt("TabletServer lock does not exist", -1);
+ Halt.halt(-1, "TabletServer lock does not exist", e);
} else {
TraceUtil.setException(span2, e, true);
throw e;