Please see the first few review comments of HBASE-16464. You can sideline the corrupt snapshots (according to the master log).
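A minimal sketch of what that sidelining could look like with the hdfs CLI (the source path comes from the master log below; the sideline directory name is just an example, not an HBase-managed location):

    # list the in-progress snapshot dirs the cleaner is tripping over
    hdfs dfs -ls /apps/hbase/data/.hbase-snapshot/.tmp/
    # move the corrupt one out of the snapshot area
    hdfs dfs -mkdir -p /apps/hbase/sidelined-snapshots
    hdfs dfs -mv /apps/hbase/data/.hbase-snapshot/.tmp/LEAD_SALES-1517979610 /apps/hbase/sidelined-snapshots/

Once the corrupt entry is out of .hbase-snapshot/.tmp, SnapshotHFileCleaner should stop hitting the CorruptedSnapshotException and the cleaner chore can go back to deleting files instead of keeping them "just in case".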
You can also contact the vendor for a hotfix.

Cheers

On Sat, Feb 10, 2018 at 8:13 AM, anil gupta <anilgupt...@gmail.com> wrote:

> Hi Folks,
>
> We are running HBase 1.1.2. It seems like we are hitting
> https://issues.apache.org/jira/browse/HBASE-16464 in our production
> cluster. Our oldWALs folder has grown to 9.5 TB. I am aware that this is
> fixed in releases after 2016, but unfortunately we need to operate this
> production cluster for a few more months (we are already migrating to a
> newer version of HBase).
>
> I have verified that we don't have any snapshots in this cluster. Also,
> we removed all the replication peers from that cluster. We already
> restarted the HBase master a few days ago, but it didn't help. We have
> TBs of oldWALs and tens of thousands of recovered-edits files (assuming
> recovered-edits files are cleaned up by the chore cleaner). The problem
> seems to have started around mid-December, but we didn't do anything
> major on this cluster at that time.
>
> I would like to know whether there is a workaround for HBASE-16464. Are
> there any references left to those deleted snapshots in HDFS or
> ZooKeeper? If yes, how can I clean them up?
>
> I keep seeing this in the HMaster logs:
>
> 2018-02-07 09:10:08,514 ERROR [hdpmaster6.bigdataprod1.wh.truecarcorp.com,60000,1517601353645_ChoreService_3] snapshot.SnapshotHFileCleaner: Exception while checking if files were valid, keeping them just in case.
> org.apache.hadoop.hbase.snapshot.CorruptedSnapshotException: Couldn't read snapshot info from: hdfs://PRODNN/apps/hbase/data/.hbase-snapshot/.tmp/LEAD_SALES-1517979610/.snapshotinfo
>     at org.apache.hadoop.hbase.snapshot.SnapshotDescriptionUtils.readSnapshotInfo(SnapshotDescriptionUtils.java:313)
>     at org.apache.hadoop.hbase.snapshot.SnapshotReferenceUtil.getHFileNames(SnapshotReferenceUtil.java:328)
>     at org.apache.hadoop.hbase.master.snapshot.SnapshotHFileCleaner$1.filesUnderSnapshot(SnapshotHFileCleaner.java:85)
>     at org.apache.hadoop.hbase.master.snapshot.SnapshotFileCache.getSnapshotsInProgress(SnapshotFileCache.java:303)
>     at org.apache.hadoop.hbase.master.snapshot.SnapshotFileCache.getUnreferencedFiles(SnapshotFileCache.java:194)
>     at org.apache.hadoop.hbase.master.snapshot.SnapshotHFileCleaner.getDeletableFiles(SnapshotHFileCleaner.java:62)
>     at org.apache.hadoop.hbase.master.cleaner.CleanerChore.checkAndDeleteFiles(CleanerChore.java:233)
>     at org.apache.hadoop.hbase.master.cleaner.CleanerChore.checkAndDeleteEntries(CleanerChore.java:157)
>     at org.apache.hadoop.hbase.master.cleaner.CleanerChore.checkAndDeleteDirectory(CleanerChore.java:180)
>     at org.apache.hadoop.hbase.master.cleaner.CleanerChore.checkAndDeleteEntries(CleanerChore.java:149)
>     at org.apache.hadoop.hbase.master.cleaner.CleanerChore.checkAndDeleteDirectory(CleanerChore.java:180)
>     at org.apache.hadoop.hbase.master.cleaner.CleanerChore.checkAndDeleteEntries(CleanerChore.java:149)
>     at org.apache.hadoop.hbase.master.cleaner.CleanerChore.checkAndDeleteDirectory(CleanerChore.java:180)
>     at org.apache.hadoop.hbase.master.cleaner.CleanerChore.checkAndDeleteEntries(CleanerChore.java:149)
>     at org.apache.hadoop.hbase.master.cleaner.CleanerChore.checkAndDeleteDirectory(CleanerChore.java:180)
>     at org.apache.hadoop.hbase.master.cleaner.CleanerChore.checkAndDeleteEntries(CleanerChore.java:149)
>     at org.apache.hadoop.hbase.master.cleaner.CleanerChore.checkAndDeleteDirectory(CleanerChore.java:180)
>     at org.apache.hadoop.hbase.master.cleaner.CleanerChore.checkAndDeleteEntries(CleanerChore.java:149)
>     at org.apache.hadoop.hbase.master.cleaner.CleanerChore.chore(CleanerChore.java:124)
>     at org.apache.hadoop.hbase.ScheduledChore.run(ScheduledChore.java:185)
>     at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511)
>     at java.util.concurrent.FutureTask.runAndReset(FutureTask.java:308)
>     at java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.access$301(ScheduledThreadPoolExecutor.java:180)
>     at java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.run(ScheduledThreadPoolExecutor.java:294)
>     at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
>     at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
>     at java.lang.Thread.run(Thread.java:745)
> Caused by: java.io.FileNotFoundException: File does not exist: /apps/hbase/data/.hbase-snapshot/.tmp/LEAD_SALES-1517979610/.snapshotinfo
>     at org.apache.hadoop.hdfs.server.namenode.INodeFile.valueOf(INodeFile.java:71)
>     at org.apache.hadoop.hdfs.server.namenode.INodeFile.valueOf(INodeFile.java:61)
>     at org.apache.hadoop.hdfs.server.namenode.FSNamesystem.getBlockLocationsInt(FSNamesystem.java:1828)
>     at org.apache.hadoop.hdfs.server.namenode.FSNamesystem.getBlockLocations(FSNamesystem.java:1799)
>     at org.apache.hadoop.hdfs.server.namenode.FSNamesystem.getBlockLocations(FSNamesystem.java:1712)
>     at org.apache.hadoop.hdfs.server.namenode.NameNodeRpcServer.getBlockLocations(NameNodeRpcServer.java:652)
>     at org.apache.hadoop.hdfs.protocolPB.ClientNamenodeProtocolServerSideTranslatorPB.getBlockLocations(ClientNamenodeProtocolServerSideTranslatorPB.java:365)
>     at org.apache.hadoop.hdfs.protocol.proto.ClientNamenodeProtocolProtos$ClientNamenodeProtocol$2.callBlockingMethod(ClientNamenodeProtocolProtos.java)
>     at org.apache.hadoop.ipc.ProtobufRpcEngine$Server$ProtoBufRpcInvoker.call(ProtobufRpcEngine.java:616)
>     at org.apache.hadoop.ipc.RPC$Server.call(RPC.java:969)
>     at org.apache.hadoop.ipc.Server$Handler$1.run(Server.java:2151)
>     at org.apache.hadoop.ipc.Server$Handler$1.run(Server.java:2147)
>     at java.security.AccessController.doPrivileged(Native Method)
>     at javax.security.auth.Subject.doAs(Subject.java:415)
>     at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1657)
>     at org.apache.hadoop.ipc.Server$Handler.run(Server.java:2145)
>
>     at sun.reflect.GeneratedConstructorAccessor22.newInstance(Unknown Source)
>     at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
>     at java.lang.reflect.Constructor.newInstance(Constructor.java:422)
>     at org.apache.hadoop.ipc.RemoteException.instantiateException(RemoteException.java:106)
>     at org.apache.hadoop.ipc.RemoteException.unwrapRemoteException(RemoteException.java:73)
>     at org.apache.hadoop.hdfs.DFSClient.callGetBlockLocations(DFSClient.java:1242)
>     at org.apache.hadoop.hdfs.DFSClient.getLocatedBlocks(DFSClient.java:1227)
>     at org.apache.hadoop.hdfs.DFSClient.getLocatedBlocks(DFSClient.java:1215)
>     at org.apache.hadoop.hdfs.DFSInputStream.fetchLocatedBlocksAndGetLastBlockLength(DFSInputStream.java:303)
>     at org.apache.hadoop.hdfs.DFSInputStream.openInfo(DFSInputStream.java:269)
>     at org.apache.hadoop.hdfs.DFSInputStream.<init>(DFSInputStream.java:261)
>     at org.apache.hadoop.hdfs.DFSClient.open(DFSClient.java:1540)
>     at org.apache.hadoop.hdfs.DistributedFileSystem$3.doCall(DistributedFileSystem.java:303)
>     at org.apache.hadoop.hdfs.DistributedFileSystem$3.doCall(DistributedFileSystem.java:299)
>     at org.apache.hadoop.fs.FileSystemLinkResolver.resolve(FileSystemLinkResolver.java:81)
>     at org.apache.hadoop.hdfs.DistributedFileSystem.open(DistributedFileSystem.java:299)
>     at org.apache.hadoop.fs.FileSystem.open(FileSystem.java:767)
>     at org.apache.hadoop.hbase.snapshot.SnapshotDescriptionUtils.readSnapshotInfo(SnapshotDescriptionUtils.java:306)
>     ... 26 more
>
>
> --
> Thanks & Regards,
> Anil Gupta