This is an automated email from the ASF dual-hosted git repository.
lzljs3620320 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/paimon.git
The following commit(s) were added to refs/heads/master by this push:
new f424d261b7 [core] Add FileType enum for classifying Paimon files (#7613)
f424d261b7 is described below
commit f424d261b76753c44af0d9ed35a9342604980d08
Author: LsomeYeah <[email protected]>
AuthorDate: Fri Apr 10 13:08:50 2026 +0800
[core] Add FileType enum for classifying Paimon files (#7613)
---
.../apache/paimon/consumer/ConsumerManager.java | 2 +-
.../org/apache/paimon/schema/SchemaManager.java | 2 +-
.../java/org/apache/paimon/utils/FileType.java | 111 ++++++++++
.../java/org/apache/paimon/utils/TagManager.java | 2 +-
.../java/org/apache/paimon/utils/FileTypeTest.java | 243 +++++++++++++++++++++
5 files changed, 357 insertions(+), 3 deletions(-)
diff --git a/paimon-core/src/main/java/org/apache/paimon/consumer/ConsumerManager.java b/paimon-core/src/main/java/org/apache/paimon/consumer/ConsumerManager.java
index 8a2ad90ff7..c6ed06f5fe 100644
--- a/paimon-core/src/main/java/org/apache/paimon/consumer/ConsumerManager.java
+++ b/paimon-core/src/main/java/org/apache/paimon/consumer/ConsumerManager.java
@@ -45,7 +45,7 @@ public class ConsumerManager implements Serializable {
private static final long serialVersionUID = 1L;
- private static final String CONSUMER_PREFIX = "consumer-";
+ public static final String CONSUMER_PREFIX = "consumer-";
private final FileIO fileIO;
private final Path tablePath;
diff --git a/paimon-core/src/main/java/org/apache/paimon/schema/SchemaManager.java b/paimon-core/src/main/java/org/apache/paimon/schema/SchemaManager.java
index 8667f2271d..549c0e96a8 100644
--- a/paimon-core/src/main/java/org/apache/paimon/schema/SchemaManager.java
+++ b/paimon-core/src/main/java/org/apache/paimon/schema/SchemaManager.java
@@ -109,7 +109,7 @@ import static org.apache.paimon.utils.Preconditions.checkState;
@ThreadSafe
public class SchemaManager implements Serializable {
- private static final String SCHEMA_PREFIX = "schema-";
+ public static final String SCHEMA_PREFIX = "schema-";
private final FileIO fileIO;
private final Path tableRoot;
diff --git a/paimon-core/src/main/java/org/apache/paimon/utils/FileType.java b/paimon-core/src/main/java/org/apache/paimon/utils/FileType.java
new file mode 100644
index 0000000000..963780cbfd
--- /dev/null
+++ b/paimon-core/src/main/java/org/apache/paimon/utils/FileType.java
@@ -0,0 +1,111 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.paimon.utils;
+
+import org.apache.paimon.consumer.ConsumerManager;
+import org.apache.paimon.fs.Path;
+import org.apache.paimon.io.DataFilePathFactory;
+import org.apache.paimon.schema.SchemaManager;
+import org.apache.paimon.service.ServiceManager;
+
+/**
+ * Classification of the files found under a Paimon table directory.
+ *
+ * <ul>
+ *   <li>{@link #META}: snapshot, schema, manifest, statistics, tag, changelog metadata, hint files,
+ *       _SUCCESS, consumer, service files
+ *   <li>{@link #DATA}: data files and any unrecognized files (default)
+ *   <li>{@link #BUCKET_INDEX}: bucket level index files (Hash, DV)
+ *   <li>{@link #GLOBAL_INDEX}: table level global index files (btree, bitmap, lumina, tantivy)
+ *   <li>{@link #FILE_INDEX}: data-file index files (bloom filter, bitmap, etc.)
+ * </ul>
+ */
+public enum FileType {
+    META,
+    DATA,
+    BUCKET_INDEX,
+    GLOBAL_INDEX,
+    FILE_INDEX;
+
+    private static final String MANIFEST = "manifest";
+    private static final String CHANGELOG_DIR = "changelog";
+    private static final String GLOBAL_INDEX_INFIX = "global-index-";
+    private static final String EARLIEST_HINT = "EARLIEST";
+    private static final String LATEST_HINT = "LATEST";
+    private static final String SUCCESS_SUFFIX = "_SUCCESS";
+
+    /** Returns {@code true} for the three index categories, {@code false} for META and DATA. */
+    public boolean isIndex() {
+        switch (this) {
+            case BUCKET_INDEX:
+            case GLOBAL_INDEX:
+            case FILE_INDEX:
+                return true;
+            default:
+                return false;
+        }
+    }
+
+    /**
+     * Determines the {@link FileType} of a file from its path.
+     *
+     * <p>The rules are evaluated in a fixed order and the first match wins. Only the file name
+     * and, for changelog metadata, the name of the direct parent directory are inspected. A path
+     * that matches no rule is reported as {@link #DATA}.
+     *
+     * @param filePath full path of the file to classify
+     * @return the matching file type, never {@code null}
+     */
+    public static FileType classify(Path filePath) {
+        String fileName = filePath.getName();
+
+        if (hasMetaPrefix(fileName)) {
+            return META;
+        }
+
+        // Any name carrying the index suffix is an index file: global index files are told apart
+        // by their "global-index-" infix, the rest are data-file indexes (e.g. data-xxx.orc.index).
+        if (fileName.endsWith(DataFilePathFactory.INDEX_PATH_SUFFIX)) {
+            return fileName.contains(GLOBAL_INDEX_INFIX) ? GLOBAL_INDEX : FILE_INDEX;
+        }
+
+        // manifest, manifest-list, index-manifest: checked ahead of the bucket index rule because
+        // index-manifest names also begin with "index-"
+        if (fileName.contains(MANIFEST)) {
+            return META;
+        }
+
+        // bucket index, e.g. index-{uuid}-{N}
+        if (fileName.startsWith(FileStorePathFactory.INDEX_PREFIX)) {
+            return BUCKET_INDEX;
+        }
+
+        // hint files, success files (a bare "_SUCCESS" also satisfies endsWith), changelog metadata
+        boolean remainingMeta =
+                EARLIEST_HINT.equals(fileName)
+                        || LATEST_HINT.equals(fileName)
+                        || fileName.endsWith(SUCCESS_SUFFIX)
+                        || isChangelogMetadata(filePath, fileName);
+        return remainingMeta ? META : DATA;
+    }
+
+    /** Matches the name prefixes snapshot-, schema-, stat-, tag-, consumer- and service-. */
+    private static boolean hasMetaPrefix(String fileName) {
+        return fileName.startsWith(SnapshotManager.SNAPSHOT_PREFIX)
+                || fileName.startsWith(SchemaManager.SCHEMA_PREFIX)
+                || fileName.startsWith(FileStorePathFactory.STATISTICS_PREFIX)
+                || fileName.startsWith(TagManager.TAG_PREFIX)
+                || fileName.startsWith(ConsumerManager.CONSUMER_PREFIX)
+                || fileName.startsWith(ServiceManager.SERVICE_PREFIX);
+    }
+
+    /** True for a "changelog-" file sitting directly inside a "changelog" directory. */
+    private static boolean isChangelogMetadata(Path filePath, String fileName) {
+        return fileName.startsWith(ChangelogManager.CHANGELOG_PREFIX)
+                && CHANGELOG_DIR.equals(filePath.getParent().getName());
+    }
+}
diff --git a/paimon-core/src/main/java/org/apache/paimon/utils/TagManager.java b/paimon-core/src/main/java/org/apache/paimon/utils/TagManager.java
index 703b93e145..4a0a18e51e 100644
--- a/paimon-core/src/main/java/org/apache/paimon/utils/TagManager.java
+++ b/paimon-core/src/main/java/org/apache/paimon/utils/TagManager.java
@@ -62,7 +62,7 @@ public class TagManager {
private static final Logger LOG =
LoggerFactory.getLogger(TagManager.class);
- private static final String TAG_PREFIX = "tag-";
+ public static final String TAG_PREFIX = "tag-";
private final FileIO fileIO;
private final Path tablePath;
diff --git a/paimon-core/src/test/java/org/apache/paimon/utils/FileTypeTest.java b/paimon-core/src/test/java/org/apache/paimon/utils/FileTypeTest.java
new file mode 100644
index 0000000000..423fe3ba5f
--- /dev/null
+++ b/paimon-core/src/test/java/org/apache/paimon/utils/FileTypeTest.java
@@ -0,0 +1,243 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.paimon.utils;
+
+import org.apache.paimon.fs.Path;
+
+import org.junit.jupiter.api.Test;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+/** Tests for {@link FileType}. */
+public class FileTypeTest {
+
+    private static final String TABLE_ROOT = "hdfs://cluster/warehouse/db.db/table";
+    private static final String BUCKET_DIR = "/dt=2024-01-01/bucket-0";
+
+    /** Asserts that every given path, appended to {@code root}, is classified as expected. */
+    private static void assertClassified(FileType expected, String root, String... relativePaths) {
+        for (String relativePath : relativePaths) {
+            Path path = new Path(root + relativePath);
+            assertThat(FileType.classify(path)).as("type of %s", path).isEqualTo(expected);
+        }
+    }
+
+    // ===== META files =====
+
+    @Test
+    public void testMetaFiles() {
+        assertClassified(
+                FileType.META,
+                TABLE_ROOT,
+                // snapshot
+                "/snapshot/snapshot-1",
+                "/snapshot/snapshot-100",
+                // schema
+                "/schema/schema-0",
+                "/schema/schema-5",
+                // manifest
+                "/manifest/manifest-a1b2c3d4-0",
+                // manifest-list
+                "/manifest/manifest-list-a1b2c3d4-0",
+                // index-manifest
+                "/manifest/index-manifest-a1b2c3d4-0",
+                // statistics
+                "/statistics/stat-a1b2c3d4-0",
+                // tag
+                "/tag/tag-2024-01-01",
+                "/tag/tag-myTag",
+                // changelog metadata
+                "/changelog/changelog-1",
+                "/changelog/changelog-100",
+                // hint files
+                "/snapshot/EARLIEST",
+                "/snapshot/LATEST",
+                // success files
+                BUCKET_DIR + "/_SUCCESS",
+                "/tag/tag-success-file/myTag_SUCCESS",
+                // consumer
+                "/consumer/consumer-myGroup",
+                // service
+                "/service/service-primary-key-lookup");
+    }
+
+    // ===== BUCKET_INDEX files =====
+
+    @Test
+    public void testBucketIndexFiles() {
+        assertClassified(
+                FileType.BUCKET_INDEX,
+                TABLE_ROOT,
+                // under /index/ dir
+                "/index/index-a1b2c3d4-0",
+                // under bucket dir
+                BUCKET_DIR + "/index-a1b2c3d4-0");
+    }
+
+    // ===== GLOBAL_INDEX files =====
+
+    @Test
+    public void testGlobalIndexFiles() {
+        assertClassified(
+                FileType.GLOBAL_INDEX,
+                TABLE_ROOT,
+                // btree global index
+                "/index/btree-global-index-a1b2c3d4-e5f6.index",
+                // bitmap global index
+                "/index/bitmap-global-index-a1b2c3d4-e5f6.index",
+                // lumina vector global index
+                "/index/lumina-vector-ann-global-index-a1b2c3d4.index",
+                // tantivy fulltext global index
+                "/index/tantivy-fulltext-global-index-a1b2c3d4.index");
+    }
+
+    // ===== FILE_INDEX files =====
+
+    @Test
+    public void testFileIndexFiles() {
+        assertClassified(
+                FileType.FILE_INDEX,
+                TABLE_ROOT,
+                BUCKET_DIR + "/data-a1b2c3d4-0.orc.index",
+                BUCKET_DIR + "/data-a1b2c3d4-0.parquet.index");
+    }
+
+    // ===== isIndex() =====
+
+    @Test
+    public void testIsIndex() {
+        // META and DATA are the only non-index types; every other constant must report true
+        for (FileType type : FileType.values()) {
+            boolean expectedIndex = type != FileType.META && type != FileType.DATA;
+            assertThat(type.isIndex()).as("isIndex of %s", type).isEqualTo(expectedIndex);
+        }
+    }
+
+    // ===== DATA files =====
+
+    @Test
+    public void testDataFiles() {
+        assertClassified(
+                FileType.DATA,
+                TABLE_ROOT,
+                // orc data file
+                BUCKET_DIR + "/data-a1b2c3d4-0.orc",
+                // parquet data file
+                BUCKET_DIR + "/data-a1b2c3d4-0.parquet",
+                // changelog data file
+                BUCKET_DIR + "/changelog-a1b2c3d4-0.orc",
+                // blob file
+                BUCKET_DIR + "/data-a1b2c3d4-0.blob",
+                // vector file
+                BUCKET_DIR + "/data-a1b2c3d4-0.vector.lance",
+                // unknown file defaults to DATA
+                BUCKET_DIR + "/unknown-file.bin");
+    }
+
+    // ===== Edge cases =====
+
+    @Test
+    public void testChangelogDirInParentPathNotMisjudged() {
+        // table root path itself contains "changelog"; files below it should not be misjudged
+        // as META
+        String tricky = "hdfs://cluster/changelog/warehouse/db.db/table";
+        assertClassified(
+                FileType.DATA,
+                tricky,
+                BUCKET_DIR + "/data-a1b2c3d4-0.orc",
+                BUCKET_DIR + "/changelog-a1b2c3d4-0.orc");
+    }
+
+    @Test
+    public void testBranchPaths() {
+        String branchRoot = TABLE_ROOT + "/branch/branch-dev";
+        assertClassified(
+                FileType.META,
+                branchRoot,
+                "/snapshot/snapshot-1",
+                "/schema/schema-0",
+                "/changelog/changelog-1");
+        assertClassified(FileType.BUCKET_INDEX, branchRoot, "/index/index-a1b2c3d4-0");
+        assertClassified(
+                FileType.GLOBAL_INDEX, branchRoot, "/index/btree-global-index-a1b2c3d4.index");
+    }
+}