This is an automated email from the ASF dual-hosted git repository.

lzljs3620320 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/paimon.git


The following commit(s) were added to refs/heads/master by this push:
     new f424d261b7 [core] Add FileType enum for classifying Paimon files (#7613)
f424d261b7 is described below

commit f424d261b76753c44af0d9ed35a9342604980d08
Author: LsomeYeah <[email protected]>
AuthorDate: Fri Apr 10 13:08:50 2026 +0800

    [core] Add FileType enum for classifying Paimon files (#7613)
---
 .../apache/paimon/consumer/ConsumerManager.java    |   2 +-
 .../org/apache/paimon/schema/SchemaManager.java    |   2 +-
 .../java/org/apache/paimon/utils/FileType.java     | 111 ++++++++++
 .../java/org/apache/paimon/utils/TagManager.java   |   2 +-
 .../java/org/apache/paimon/utils/FileTypeTest.java | 243 +++++++++++++++++++++
 5 files changed, 357 insertions(+), 3 deletions(-)

diff --git a/paimon-core/src/main/java/org/apache/paimon/consumer/ConsumerManager.java b/paimon-core/src/main/java/org/apache/paimon/consumer/ConsumerManager.java
index 8a2ad90ff7..c6ed06f5fe 100644
--- a/paimon-core/src/main/java/org/apache/paimon/consumer/ConsumerManager.java
+++ b/paimon-core/src/main/java/org/apache/paimon/consumer/ConsumerManager.java
@@ -45,7 +45,7 @@ public class ConsumerManager implements Serializable {
 
     private static final long serialVersionUID = 1L;
 
-    private static final String CONSUMER_PREFIX = "consumer-";
+    public static final String CONSUMER_PREFIX = "consumer-";
 
     private final FileIO fileIO;
     private final Path tablePath;
diff --git a/paimon-core/src/main/java/org/apache/paimon/schema/SchemaManager.java b/paimon-core/src/main/java/org/apache/paimon/schema/SchemaManager.java
index 8667f2271d..549c0e96a8 100644
--- a/paimon-core/src/main/java/org/apache/paimon/schema/SchemaManager.java
+++ b/paimon-core/src/main/java/org/apache/paimon/schema/SchemaManager.java
@@ -109,7 +109,7 @@ import static org.apache.paimon.utils.Preconditions.checkState;
 @ThreadSafe
 public class SchemaManager implements Serializable {
 
-    private static final String SCHEMA_PREFIX = "schema-";
+    public static final String SCHEMA_PREFIX = "schema-";
 
     private final FileIO fileIO;
     private final Path tableRoot;
diff --git a/paimon-core/src/main/java/org/apache/paimon/utils/FileType.java b/paimon-core/src/main/java/org/apache/paimon/utils/FileType.java
new file mode 100644
index 0000000000..963780cbfd
--- /dev/null
+++ b/paimon-core/src/main/java/org/apache/paimon/utils/FileType.java
@@ -0,0 +1,111 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.paimon.utils;
+
+import org.apache.paimon.consumer.ConsumerManager;
+import org.apache.paimon.fs.Path;
+import org.apache.paimon.io.DataFilePathFactory;
+import org.apache.paimon.schema.SchemaManager;
+import org.apache.paimon.service.ServiceManager;
+
+/**
+ * Classification of Paimon files.
+ *
+ * <ul>
+ *   <li>{@link #META}: snapshot, schema, manifest, statistics, tag, changelog 
metadata, hint files,
+ *       _SUCCESS, consumer, service files
+ *   <li>{@link #DATA}: data files and any unrecognized files (default)
+ *   <li>{@link #BUCKET_INDEX}: bucket level index files (Hash, DV)
+ *   <li>{@link #GLOBAL_INDEX}: table level global index files (btree, bitmap, 
lumina, tantivy)
+ *   <li>{@link #FILE_INDEX}: data-file index files (bloom filter, bitmap, 
etc.)
+ * </ul>
+ */
+public enum FileType {
+    META,
+    DATA,
+    BUCKET_INDEX,
+    GLOBAL_INDEX,
+    FILE_INDEX;
+
+    private static final String MANIFEST = "manifest";
+    private static final String CHANGELOG_DIR = "changelog";
+    private static final String GLOBAL_INDEX_INFIX = "global-index-";
+
+    /** Returns {@code true} if this file type is any kind of index. */
+    public boolean isIndex() {
+        return this == BUCKET_INDEX || this == GLOBAL_INDEX || this == 
FILE_INDEX;
+    }
+
+    /**
+     * Classify a file based on its full path.
+     *
+     * <p>When the file does not match any known pattern, it defaults to 
{@link #DATA}.
+     */
+    public static FileType classify(Path filePath) {
+        String name = filePath.getName();
+
+        // meta file prefixes: snapshot-, schema-, stat-, tag-, consumer-, 
service-
+        if (name.startsWith(SnapshotManager.SNAPSHOT_PREFIX)
+                || name.startsWith(SchemaManager.SCHEMA_PREFIX)
+                || name.startsWith(FileStorePathFactory.STATISTICS_PREFIX)
+                || name.startsWith(TagManager.TAG_PREFIX)
+                || name.startsWith(ConsumerManager.CONSUMER_PREFIX)
+                || name.startsWith(ServiceManager.SERVICE_PREFIX)) {
+            return META;
+        }
+
+        // file index: {data-file}.index (e.g. data-xxx.orc.index)
+        // must check before global index since global index also ends with 
".index"
+        if (name.endsWith(DataFilePathFactory.INDEX_PATH_SUFFIX)) {
+            if (name.contains(GLOBAL_INDEX_INFIX)) {
+                return GLOBAL_INDEX;
+            }
+            return FILE_INDEX;
+        }
+
+        // manifest, manifest-list, index-manifest: name contains "manifest"
+        if (name.contains(MANIFEST)) {
+            return META;
+        }
+
+        // bucket index: name starts with "index-" (e.g. index-{uuid}-{N})
+        if (name.startsWith(FileStorePathFactory.INDEX_PREFIX)) {
+            return BUCKET_INDEX;
+        }
+
+        // hint files
+        if ("EARLIEST".equals(name) || "LATEST".equals(name)) {
+            return META;
+        }
+
+        // success files
+        if ("_SUCCESS".equals(name) || name.endsWith("_SUCCESS")) {
+            return META;
+        }
+
+        // changelog metadata: parent dir is "changelog" and name starts with 
"changelog-"
+        if (name.startsWith(ChangelogManager.CHANGELOG_PREFIX)
+                && CHANGELOG_DIR.equals(filePath.getParent().getName())) {
+            return META;
+        }
+
+        // default: DATA
+        return DATA;
+    }
+}
diff --git a/paimon-core/src/main/java/org/apache/paimon/utils/TagManager.java b/paimon-core/src/main/java/org/apache/paimon/utils/TagManager.java
index 703b93e145..4a0a18e51e 100644
--- a/paimon-core/src/main/java/org/apache/paimon/utils/TagManager.java
+++ b/paimon-core/src/main/java/org/apache/paimon/utils/TagManager.java
@@ -62,7 +62,7 @@ public class TagManager {
 
     private static final Logger LOG = LoggerFactory.getLogger(TagManager.class);
 
-    private static final String TAG_PREFIX = "tag-";
+    public static final String TAG_PREFIX = "tag-";
 
     private final FileIO fileIO;
     private final Path tablePath;
diff --git a/paimon-core/src/test/java/org/apache/paimon/utils/FileTypeTest.java b/paimon-core/src/test/java/org/apache/paimon/utils/FileTypeTest.java
new file mode 100644
index 0000000000..423fe3ba5f
--- /dev/null
+++ b/paimon-core/src/test/java/org/apache/paimon/utils/FileTypeTest.java
@@ -0,0 +1,243 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.paimon.utils;
+
+import org.apache.paimon.fs.Path;
+
+import org.junit.jupiter.api.Test;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+/** Tests for {@link FileType}. */
+public class FileTypeTest {
+
+    private static final String TABLE_ROOT = 
"hdfs://cluster/warehouse/db.db/table";
+
+    // ===== META files =====
+
+    @Test
+    public void testMetaFiles() {
+        // snapshot
+        assertThat(FileType.classify(new Path(TABLE_ROOT + 
"/snapshot/snapshot-1")))
+                .isEqualTo(FileType.META);
+        assertThat(FileType.classify(new Path(TABLE_ROOT + 
"/snapshot/snapshot-100")))
+                .isEqualTo(FileType.META);
+        // schema
+        assertThat(FileType.classify(new Path(TABLE_ROOT + 
"/schema/schema-0")))
+                .isEqualTo(FileType.META);
+        assertThat(FileType.classify(new Path(TABLE_ROOT + 
"/schema/schema-5")))
+                .isEqualTo(FileType.META);
+        // manifest
+        assertThat(FileType.classify(new Path(TABLE_ROOT + 
"/manifest/manifest-a1b2c3d4-0")))
+                .isEqualTo(FileType.META);
+        // manifest-list
+        assertThat(FileType.classify(new Path(TABLE_ROOT + 
"/manifest/manifest-list-a1b2c3d4-0")))
+                .isEqualTo(FileType.META);
+        // index-manifest
+        assertThat(FileType.classify(new Path(TABLE_ROOT + 
"/manifest/index-manifest-a1b2c3d4-0")))
+                .isEqualTo(FileType.META);
+        // statistics
+        assertThat(FileType.classify(new Path(TABLE_ROOT + 
"/statistics/stat-a1b2c3d4-0")))
+                .isEqualTo(FileType.META);
+        // tag
+        assertThat(FileType.classify(new Path(TABLE_ROOT + 
"/tag/tag-2024-01-01")))
+                .isEqualTo(FileType.META);
+        assertThat(FileType.classify(new Path(TABLE_ROOT + "/tag/tag-myTag")))
+                .isEqualTo(FileType.META);
+        // changelog metadata
+        assertThat(FileType.classify(new Path(TABLE_ROOT + 
"/changelog/changelog-1")))
+                .isEqualTo(FileType.META);
+        assertThat(FileType.classify(new Path(TABLE_ROOT + 
"/changelog/changelog-100")))
+                .isEqualTo(FileType.META);
+        // hint files
+        assertThat(FileType.classify(new Path(TABLE_ROOT + 
"/snapshot/EARLIEST")))
+                .isEqualTo(FileType.META);
+        assertThat(FileType.classify(new Path(TABLE_ROOT + 
"/snapshot/LATEST")))
+                .isEqualTo(FileType.META);
+        // success files
+        assertThat(FileType.classify(new Path(TABLE_ROOT + 
"/dt=2024-01-01/bucket-0/_SUCCESS")))
+                .isEqualTo(FileType.META);
+        assertThat(FileType.classify(new Path(TABLE_ROOT + 
"/tag/tag-success-file/myTag_SUCCESS")))
+                .isEqualTo(FileType.META);
+        // consumer
+        assertThat(FileType.classify(new Path(TABLE_ROOT + 
"/consumer/consumer-myGroup")))
+                .isEqualTo(FileType.META);
+        // service
+        assertThat(FileType.classify(new Path(TABLE_ROOT + 
"/service/service-primary-key-lookup")))
+                .isEqualTo(FileType.META);
+    }
+
+    // ===== BUCKET_INDEX files =====
+
+    @Test
+    public void testBucketIndexFiles() {
+        // under /index/ dir
+        assertThat(FileType.classify(new Path(TABLE_ROOT + 
"/index/index-a1b2c3d4-0")))
+                .isEqualTo(FileType.BUCKET_INDEX);
+        // under bucket dir
+        assertThat(
+                        FileType.classify(
+                                new Path(TABLE_ROOT + 
"/dt=2024-01-01/bucket-0/index-a1b2c3d4-0")))
+                .isEqualTo(FileType.BUCKET_INDEX);
+    }
+
+    // ===== GLOBAL_INDEX files =====
+
+    @Test
+    public void testGlobalIndexFiles() {
+        // btree global index
+        assertThat(
+                        FileType.classify(
+                                new Path(
+                                        TABLE_ROOT
+                                                + 
"/index/btree-global-index-a1b2c3d4-e5f6.index")))
+                .isEqualTo(FileType.GLOBAL_INDEX);
+        // bitmap global index
+        assertThat(
+                        FileType.classify(
+                                new Path(
+                                        TABLE_ROOT
+                                                + 
"/index/bitmap-global-index-a1b2c3d4-e5f6.index")))
+                .isEqualTo(FileType.GLOBAL_INDEX);
+        // lumina vector global index
+        assertThat(
+                        FileType.classify(
+                                new Path(
+                                        TABLE_ROOT
+                                                + 
"/index/lumina-vector-ann-global-index-a1b2c3d4.index")))
+                .isEqualTo(FileType.GLOBAL_INDEX);
+        // tantivy fulltext global index
+        assertThat(
+                        FileType.classify(
+                                new Path(
+                                        TABLE_ROOT
+                                                + 
"/index/tantivy-fulltext-global-index-a1b2c3d4.index")))
+                .isEqualTo(FileType.GLOBAL_INDEX);
+    }
+
+    // ===== FILE_INDEX files =====
+
+    @Test
+    public void testFileIndexFiles() {
+        assertThat(
+                        FileType.classify(
+                                new Path(
+                                        TABLE_ROOT
+                                                + 
"/dt=2024-01-01/bucket-0/data-a1b2c3d4-0.orc.index")))
+                .isEqualTo(FileType.FILE_INDEX);
+        assertThat(
+                        FileType.classify(
+                                new Path(
+                                        TABLE_ROOT
+                                                + 
"/dt=2024-01-01/bucket-0/data-a1b2c3d4-0.parquet.index")))
+                .isEqualTo(FileType.FILE_INDEX);
+    }
+
+    // ===== isIndex() =====
+
+    @Test
+    public void testIsIndex() {
+        assertThat(FileType.BUCKET_INDEX.isIndex()).isTrue();
+        assertThat(FileType.GLOBAL_INDEX.isIndex()).isTrue();
+        assertThat(FileType.FILE_INDEX.isIndex()).isTrue();
+        assertThat(FileType.META.isIndex()).isFalse();
+        assertThat(FileType.DATA.isIndex()).isFalse();
+    }
+
+    // ===== DATA files =====
+
+    @Test
+    public void testDataFiles() {
+        // orc data file
+        assertThat(
+                        FileType.classify(
+                                new Path(
+                                        TABLE_ROOT
+                                                + 
"/dt=2024-01-01/bucket-0/data-a1b2c3d4-0.orc")))
+                .isEqualTo(FileType.DATA);
+        // parquet data file
+        assertThat(
+                        FileType.classify(
+                                new Path(
+                                        TABLE_ROOT
+                                                + 
"/dt=2024-01-01/bucket-0/data-a1b2c3d4-0.parquet")))
+                .isEqualTo(FileType.DATA);
+        // changelog data file
+        assertThat(
+                        FileType.classify(
+                                new Path(
+                                        TABLE_ROOT
+                                                + 
"/dt=2024-01-01/bucket-0/changelog-a1b2c3d4-0.orc")))
+                .isEqualTo(FileType.DATA);
+        // blob file
+        assertThat(
+                        FileType.classify(
+                                new Path(
+                                        TABLE_ROOT
+                                                + 
"/dt=2024-01-01/bucket-0/data-a1b2c3d4-0.blob")))
+                .isEqualTo(FileType.DATA);
+        // vector file
+        assertThat(
+                        FileType.classify(
+                                new Path(
+                                        TABLE_ROOT
+                                                + 
"/dt=2024-01-01/bucket-0/data-a1b2c3d4-0.vector.lance")))
+                .isEqualTo(FileType.DATA);
+        // unknown file defaults to DATA
+        assertThat(
+                        FileType.classify(
+                                new Path(TABLE_ROOT + 
"/dt=2024-01-01/bucket-0/unknown-file.bin")))
+                .isEqualTo(FileType.DATA);
+    }
+
+    // ===== Edge cases =====
+
+    @Test
+    public void testChangelogDirInParentPathNotMisjudged() {
+        // table root path itself contains "changelog", should not be 
misjudged as META
+        String tricky = "hdfs://cluster/changelog/warehouse/db.db/table";
+        assertThat(
+                        FileType.classify(
+                                new Path(tricky + 
"/dt=2024-01-01/bucket-0/data-a1b2c3d4-0.orc")))
+                .isEqualTo(FileType.DATA);
+        assertThat(
+                        FileType.classify(
+                                new Path(
+                                        tricky
+                                                + 
"/dt=2024-01-01/bucket-0/changelog-a1b2c3d4-0.orc")))
+                .isEqualTo(FileType.DATA);
+    }
+
+    @Test
+    public void testBranchPaths() {
+        String branchRoot = TABLE_ROOT + "/branch/branch-dev";
+        assertThat(FileType.classify(new Path(branchRoot + 
"/snapshot/snapshot-1")))
+                .isEqualTo(FileType.META);
+        assertThat(FileType.classify(new Path(branchRoot + 
"/schema/schema-0")))
+                .isEqualTo(FileType.META);
+        assertThat(FileType.classify(new Path(branchRoot + 
"/changelog/changelog-1")))
+                .isEqualTo(FileType.META);
+        assertThat(FileType.classify(new Path(branchRoot + 
"/index/index-a1b2c3d4-0")))
+                .isEqualTo(FileType.BUCKET_INDEX);
+        assertThat(
+                        FileType.classify(
+                                new Path(branchRoot + 
"/index/btree-global-index-a1b2c3d4.index")))
+                .isEqualTo(FileType.GLOBAL_INDEX);
+    }
+}

Reply via email to