[
https://issues.apache.org/jira/browse/HDDS-14699?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
]
Siyao Meng updated HDDS-14699:
------------------------------
Status: Patch Available (was: Open)
> Fix orphan snapshot versions handling when snapshot chain tableKey mapping is
> stale
> -----------------------------------------------------------------------------------
>
> Key: HDDS-14699
> URL: https://issues.apache.org/jira/browse/HDDS-14699
> Project: Apache Ozone
> Issue Type: Bug
> Components: Ozone Manager
> Reporter: Siyao Meng
> Assignee: Siyao Meng
> Priority: Major
>
> In the isSnapshotPurged() check, the snapshot chain tableKey lookup returning
> null should not be the sole indicator of whether the snapshot is still active,
> because the snapshot chain tableKey mapping can be stale.
> When isSnapshotPurged() incorrectly returns true, checkOrphanSnapshotVersions()
> incorrectly removes an active snapshot's YAML metadata (during
> OmSnapshotLocalDataManagerService runs). This in turn causes an NPE in
> CacheLoader when attempting to load the snapshot.
> OM log:
> {code}
> 2026-02-16 09:47:40,047 INFO [IPC Server handler 92 on
> 9862]-org.apache.hadoop.ozone.om.snapshot.SnapshotCache: Loading SnapshotId:
> '28d99c74-13d8-4a9d-91fe-7d0530ca84a3'
> 2026-02-16 09:47:40,050 WARN [IPC Server handler 92 on
> 9862]-org.apache.hadoop.ipc.Server: IPC Server handler 92 on 9862, call
> Call#2 Retry#0
> org.apache.hadoop.ozone.om.protocol.OzoneManagerProtocol.submitRequest from
> 10.65.50.249:55484
> java.lang.IllegalStateException: java.lang.NullPointerException
> at
> org.apache.hadoop.ozone.om.snapshot.SnapshotCache.lambda$2(SnapshotCache.java:217)
> at
> java.base/java.util.concurrent.ConcurrentHashMap.compute(ConcurrentHashMap.java:1947)
> at
> org.apache.hadoop.ozone.om.snapshot.SnapshotCache.get(SnapshotCache.java:202)
> at
> org.apache.hadoop.ozone.om.OmSnapshotManager.getSnapshot(OmSnapshotManager.java:693)
> at
> org.apache.hadoop.ozone.om.OmSnapshotManager.getSnapshot(OmSnapshotManager.java:681)
> at
> org.apache.hadoop.ozone.om.OmSnapshotManager.getActiveSnapshot(OmSnapshotManager.java:648)
> at
> org.apache.hadoop.ozone.om.OmSnapshotManager.getActiveFsMetadataOrSnapshot(OmSnapshotManager.java:638)
> at
> org.apache.hadoop.ozone.om.OzoneManager.getReader(OzoneManager.java:5013)
> at
> org.apache.hadoop.ozone.om.OzoneManager.listStatus(OzoneManager.java:3874)
> at
> org.apache.hadoop.ozone.om.OzoneManager.listStatus(OzoneManager.java:3865)
> at
> org.apache.hadoop.ozone.protocolPB.OzoneManagerRequestHandler.listStatus(OzoneManagerRequestHandler.java:1196)
> at
> org.apache.hadoop.ozone.protocolPB.OzoneManagerRequestHandler.handleReadRequest(OzoneManagerRequestHandler.java:269)
> at
> org.apache.hadoop.ozone.protocolPB.OzoneManagerProtocolServerSideTranslatorPB.submitReadRequestToOM(OzoneManagerProtocolServerSideTranslatorPB.java:245)
> at
> org.apache.hadoop.ozone.protocolPB.OzoneManagerProtocolServerSideTranslatorPB.internalProcessRequest(OzoneManagerProtocolServerSideTranslatorPB.java:198)
> at
> org.apache.hadoop.ozone.protocolPB.OzoneManagerProtocolServerSideTranslatorPB.processRequest(OzoneManagerProtocolServerSideTranslatorPB.java:158)
> at
> org.apache.hadoop.hdds.server.OzoneProtocolMessageDispatcher.processRequest(OzoneProtocolMessageDispatcher.java:87)
> at
> org.apache.hadoop.ozone.protocolPB.OzoneManagerProtocolServerSideTranslatorPB.submitRequest(OzoneManagerProtocolServerSideTranslatorPB.java:148)
> at
> org.apache.hadoop.ozone.protocol.proto.OzoneManagerProtocolProtos$OzoneManagerService$2.callBlockingMethod(OzoneManagerProtocolProtos.java)
> at
> org.apache.hadoop.ipc.ProtobufRpcEngine$Server$ProtoBufRpcInvoker.call(ProtobufRpcEngine.java:533)
> at org.apache.hadoop.ipc.RPC$Server.call(RPC.java:1070)
> at org.apache.hadoop.ipc.Server$RpcCall.run(Server.java:995)
> at org.apache.hadoop.ipc.Server$RpcCall.run(Server.java:923)
> at java.base/java.security.AccessController.doPrivileged(Native Method)
> at java.base/javax.security.auth.Subject.doAs(Subject.java:423)
> at
> org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1910)
> at org.apache.hadoop.ipc.Server$Handler.run(Server.java:2905)
> Caused by: java.lang.NullPointerException
> at
> org.apache.hadoop.ozone.om.OmSnapshotManager$1.load(OmSnapshotManager.java:404)
> at
> org.apache.hadoop.ozone.om.OmSnapshotManager$1.load(OmSnapshotManager.java:1)
> at
> org.apache.hadoop.ozone.om.snapshot.SnapshotCache.lambda$2(SnapshotCache.java:206)
> ... 25 more
> {code}
> Client log (the client keeps retrying, but keeps hitting the same NPE):
> {code}
> $ ozone fs -ls
> ofs://ozone1771677192/vol-test-workload-om-decommission-recommission-1771689727/buck-test-workload-om-decommission-recommission-1771689727/.snapshot/snap-79xbc/
> 26/02/21 17:42:47 INFO retry.RetryInvocationHandler:
> com.google.protobuf.ServiceException:
> org.apache.hadoop.ipc.RemoteException(java.lang.IllegalStateException):
> java.lang.NullPointerException
> at
> org.apache.hadoop.ozone.om.snapshot.SnapshotCache.lambda$2(SnapshotCache.java:217)
> at
> java.base/java.util.concurrent.ConcurrentHashMap.compute(ConcurrentHashMap.java:1908)
> at
> org.apache.hadoop.ozone.om.snapshot.SnapshotCache.get(SnapshotCache.java:202)
> at
> org.apache.hadoop.ozone.om.OmSnapshotManager.getSnapshot(OmSnapshotManager.java:693)
> at
> org.apache.hadoop.ozone.om.OmSnapshotManager.getSnapshot(OmSnapshotManager.java:681)
> at
> org.apache.hadoop.ozone.om.OmSnapshotManager.getActiveSnapshot(OmSnapshotManager.java:648)
> at
> org.apache.hadoop.ozone.om.OmSnapshotManager.getActiveFsMetadataOrSnapshot(OmSnapshotManager.java:638)
> at
> org.apache.hadoop.ozone.om.OzoneManager.getReader(OzoneManager.java:5013)
> at
> org.apache.hadoop.ozone.om.OzoneManager.getFileStatus(OzoneManager.java:3817)
> at
> org.apache.hadoop.ozone.protocolPB.OzoneManagerRequestHandler.getOzoneFileStatus(OzoneManagerRequestHandler.java:1024)
> at
> org.apache.hadoop.ozone.protocolPB.OzoneManagerRequestHandler.handleReadRequest(OzoneManagerRequestHandler.java:258)
> at
> org.apache.hadoop.ozone.protocolPB.OzoneManagerProtocolServerSideTranslatorPB.submitReadRequestToOM(OzoneManagerProtocolServerSideTranslatorPB.java:245)
> at
> org.apache.hadoop.ozone.protocolPB.OzoneManagerProtocolServerSideTranslatorPB.internalProcessRequest(OzoneManagerProtocolServerSideTranslatorPB.java:198)
> at
> org.apache.hadoop.ozone.protocolPB.OzoneManagerProtocolServerSideTranslatorPB.processRequest(OzoneManagerProtocolServerSideTranslatorPB.java:158)
> at
> org.apache.hadoop.hdds.server.OzoneProtocolMessageDispatcher.processRequest(OzoneProtocolMessageDispatcher.java:87)
> at
> org.apache.hadoop.ozone.protocolPB.OzoneManagerProtocolServerSideTranslatorPB.submitRequest(OzoneManagerProtocolServerSideTranslatorPB.java:148)
> at
> org.apache.hadoop.ozone.protocol.proto.OzoneManagerProtocolProtos$OzoneManagerService$2.callBlockingMethod(OzoneManagerProtocolProtos.java)
> at
> org.apache.hadoop.ipc.ProtobufRpcEngine$Server$ProtoBufRpcInvoker.call(ProtobufRpcEngine.java:533)
> at org.apache.hadoop.ipc.RPC$Server.call(RPC.java:1070)
> at org.apache.hadoop.ipc.Server$RpcCall.run(Server.java:995)
> at org.apache.hadoop.ipc.Server$RpcCall.run(Server.java:923)
> at java.base/java.security.AccessController.doPrivileged(Native Method)
> at java.base/javax.security.auth.Subject.doAs(Subject.java:423)
> at
> org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1910)
> at org.apache.hadoop.ipc.Server$Handler.run(Server.java:2905)
> Caused by: java.lang.NullPointerException
> at
> org.apache.hadoop.ozone.om.OmSnapshotManager$1.load(OmSnapshotManager.java:404)
> at
> org.apache.hadoop.ozone.om.OmSnapshotManager$1.load(OmSnapshotManager.java:1)
> at
> org.apache.hadoop.ozone.om.snapshot.SnapshotCache.lambda$2(SnapshotCache.java:206)
> ... 24 more
> {code}
--
This message was sent by Atlassian Jira
(v8.20.10#820010)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]