This is an automated email from the ASF dual-hosted git repository.

morningman pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new f3c415dfaec [fix](hudi) support reading hudi read optimized table with 
orc format (#44995)
f3c415dfaec is described below

commit f3c415dfaec698a443b657229a56be06923ed522
Author: Socrates <suyit...@selectdb.com>
AuthorDate: Thu Dec 5 01:05:54 2024 +0800

    [fix](hudi) support reading hudi read optimized table with orc format 
(#44995)
    
    ### What problem does this PR solve?
    Problem Summary:
    When reading a Hudi read-optimized (ro) table that has no delta logs,
    the scan falls back from the JNI reader to the native reader. However,
    this fallback always set the file format to parquet and did not consider
    the case where the Hudi table is stored in orc format.
    
    1. support reading hudi read optimized table with orc format
    2. fix explain results of HudiScanNode when force_jni_scanner=true
    3. add test cases for timestamps with different time zones
---
 .../doris/datasource/hudi/source/HudiScanNode.java | 14 +++++++--
 .../hudi/test_hudi_orc_tables.out                  | 15 ++++++++++
 .../external_table_p2/hudi/test_hudi_timestamp.out | 31 ++++++++++++++++++--
 ...imestamp.groovy => test_hudi_orc_tables.groovy} | 33 ++++------------------
 .../hudi/test_hudi_timestamp.groovy                | 18 ++++++++++--
 5 files changed, 76 insertions(+), 35 deletions(-)

diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/datasource/hudi/source/HudiScanNode.java
 
b/fe/fe-core/src/main/java/org/apache/doris/datasource/hudi/source/HudiScanNode.java
index 28805aae63c..a73a2065d0f 100644
--- 
a/fe/fe-core/src/main/java/org/apache/doris/datasource/hudi/source/HudiScanNode.java
+++ 
b/fe/fe-core/src/main/java/org/apache/doris/datasource/hudi/source/HudiScanNode.java
@@ -25,6 +25,7 @@ import org.apache.doris.catalog.PartitionItem;
 import org.apache.doris.catalog.Type;
 import org.apache.doris.common.AnalysisException;
 import org.apache.doris.common.UserException;
+import org.apache.doris.common.util.FileFormatUtils;
 import org.apache.doris.common.util.LocationPath;
 import org.apache.doris.datasource.ExternalTable;
 import org.apache.doris.datasource.FileSplit;
@@ -247,8 +248,15 @@ public class HudiScanNode extends HiveScanNode {
                     && !sessionVariable.isForceJniScanner()
                     && hudiSplit.getHudiDeltaLogs().isEmpty()) {
                 // no logs, is read optimize table, fallback to use native 
reader
-                // TODO: support read orc hudi table in native reader
-                rangeDesc.setFormatType(TFileFormatType.FORMAT_PARQUET);
+                String fileFormat = 
FileFormatUtils.getFileFormatBySuffix(hudiSplit.getDataFilePath())
+                        .orElse("Unknown");
+                if (fileFormat.equals("parquet")) {
+                    rangeDesc.setFormatType(TFileFormatType.FORMAT_PARQUET);
+                } else if (fileFormat.equals("orc")) {
+                    rangeDesc.setFormatType(TFileFormatType.FORMAT_ORC);
+                } else {
+                    throw new RuntimeException("Unsupported file format: " + 
fileFormat);
+                }
             }
             setHudiParams(rangeDesc, hudiSplit);
         }
@@ -495,7 +503,7 @@ public class HudiScanNode extends HiveScanNode {
         List<String> logs = fileSlice.getLogFiles().map(HoodieLogFile::getPath)
                 .map(StoragePath::toString)
                 .collect(Collectors.toList());
-        if (logs.isEmpty()) {
+        if (logs.isEmpty() && !sessionVariable.isForceJniScanner()) {
             noLogsSplitNum.incrementAndGet();
         }
 
diff --git 
a/regression-test/data/external_table_p2/hudi/test_hudi_orc_tables.out 
b/regression-test/data/external_table_p2/hudi/test_hudi_orc_tables.out
new file mode 100644
index 00000000000..9e28074dc91
--- /dev/null
+++ b/regression-test/data/external_table_p2/hudi/test_hudi_orc_tables.out
@@ -0,0 +1,15 @@
+-- This file is automatically generated. You should know what you did if you 
want to edit this
+-- !cow --
+20241204190011744      20241204190011744_0_6   20241204190011744_0_0           
a99e363a-6c10-40f3-a675-9117506d1a43-0_0-38-94_20241204190011744.orc    1       
A
+20241204190011744      20241204190011744_0_7   20241204190011744_2_0           
a99e363a-6c10-40f3-a675-9117506d1a43-0_0-38-94_20241204190011744.orc    3       
C
+20241204190011744      20241204190011744_0_8   20241204190011744_4_0           
a99e363a-6c10-40f3-a675-9117506d1a43-0_0-38-94_20241204190011744.orc    5       
E
+20241204190011744      20241204190011744_0_9   20241204190011744_1_0           
a99e363a-6c10-40f3-a675-9117506d1a43-0_0-38-94_20241204190011744.orc    2       
B
+20241204190011744      20241204190011744_0_10  20241204190011744_3_0           
a99e363a-6c10-40f3-a675-9117506d1a43-0_0-38-94_20241204190011744.orc    4       
D
+
+-- !mor --
+20241204190002046      20241204190002046_0_11  20241204190002046_0_0           
b1e68412-01d6-467f-b4c2-b4b18ec71346-0_0-30-75_20241204190002046.orc    1       
A
+20241204190002046      20241204190002046_0_12  20241204190002046_2_0           
b1e68412-01d6-467f-b4c2-b4b18ec71346-0_0-30-75_20241204190002046.orc    3       
C
+20241204190002046      20241204190002046_0_13  20241204190002046_4_0           
b1e68412-01d6-467f-b4c2-b4b18ec71346-0_0-30-75_20241204190002046.orc    5       
E
+20241204190002046      20241204190002046_0_14  20241204190002046_1_0           
b1e68412-01d6-467f-b4c2-b4b18ec71346-0_0-30-75_20241204190002046.orc    2       
B
+20241204190002046      20241204190002046_0_15  20241204190002046_3_0           
b1e68412-01d6-467f-b4c2-b4b18ec71346-0_0-30-75_20241204190002046.orc    4       
D
+
diff --git 
a/regression-test/data/external_table_p2/hudi/test_hudi_timestamp.out 
b/regression-test/data/external_table_p2/hudi/test_hudi_timestamp.out
index dc47ff86d90..9bdb0f7cb72 100644
--- a/regression-test/data/external_table_p2/hudi/test_hudi_timestamp.out
+++ b/regression-test/data/external_table_p2/hudi/test_hudi_timestamp.out
@@ -1,6 +1,31 @@
 -- This file is automatically generated. You should know what you did if you 
want to edit this
--- !timestamp --
+-- !timestamp1 --
 20241115015956800      20241115015956800_0_2   1               
eec4913a-0d5f-4b8b-a0f5-934e252c2e45-0_0-7-14_20241115015956800.parquet 1       
Alice   2024-10-25T08:00
-20241115015956800      20241115015956800_0_0   2               
eec4913a-0d5f-4b8b-a0f5-934e252c2e45-0_0-7-14_20241115015956800.parquet 2       
Bob     2024-10-25T09:30:00
-20241115015956800      20241115015956800_0_1   3               
eec4913a-0d5f-4b8b-a0f5-934e252c2e45-0_0-7-14_20241115015956800.parquet 3       
Charlie 2024-10-25T11:00:00
+20241115015956800      20241115015956800_0_0   2               
eec4913a-0d5f-4b8b-a0f5-934e252c2e45-0_0-7-14_20241115015956800.parquet 2       
Bob     2024-10-25T09:30
+20241115015956800      20241115015956800_0_1   3               
eec4913a-0d5f-4b8b-a0f5-934e252c2e45-0_0-7-14_20241115015956800.parquet 3       
Charlie 2024-10-25T11:00
+
+-- !timestamp2 --
+20241115015956800      20241115015956800_0_2   1               
eec4913a-0d5f-4b8b-a0f5-934e252c2e45-0_0-7-14_20241115015956800.parquet 1       
Alice   2024-10-25T23:00
+20241115015956800      20241115015956800_0_0   2               
eec4913a-0d5f-4b8b-a0f5-934e252c2e45-0_0-7-14_20241115015956800.parquet 2       
Bob     2024-10-26T00:30
+20241115015956800      20241115015956800_0_1   3               
eec4913a-0d5f-4b8b-a0f5-934e252c2e45-0_0-7-14_20241115015956800.parquet 3       
Charlie 2024-10-26T02:00
+
+-- !timestamp3 --
+20241115015956800      20241115015956800_0_2   1               
eec4913a-0d5f-4b8b-a0f5-934e252c2e45-0_0-7-14_20241115015956800.parquet 1       
Alice   2024-10-25T15:00
+20241115015956800      20241115015956800_0_0   2               
eec4913a-0d5f-4b8b-a0f5-934e252c2e45-0_0-7-14_20241115015956800.parquet 2       
Bob     2024-10-25T16:30
+20241115015956800      20241115015956800_0_1   3               
eec4913a-0d5f-4b8b-a0f5-934e252c2e45-0_0-7-14_20241115015956800.parquet 3       
Charlie 2024-10-25T18:00
+
+-- !timestamp1 --
+20241115015956800      20241115015956800_0_2   1               
eec4913a-0d5f-4b8b-a0f5-934e252c2e45-0_0-7-14_20241115015956800.parquet 1       
Alice   2024-10-25T08:00
+20241115015956800      20241115015956800_0_0   2               
eec4913a-0d5f-4b8b-a0f5-934e252c2e45-0_0-7-14_20241115015956800.parquet 2       
Bob     2024-10-25T09:30
+20241115015956800      20241115015956800_0_1   3               
eec4913a-0d5f-4b8b-a0f5-934e252c2e45-0_0-7-14_20241115015956800.parquet 3       
Charlie 2024-10-25T11:00
+
+-- !timestamp2 --
+20241115015956800      20241115015956800_0_2   1               
eec4913a-0d5f-4b8b-a0f5-934e252c2e45-0_0-7-14_20241115015956800.parquet 1       
Alice   2024-10-25T23:00
+20241115015956800      20241115015956800_0_0   2               
eec4913a-0d5f-4b8b-a0f5-934e252c2e45-0_0-7-14_20241115015956800.parquet 2       
Bob     2024-10-26T00:30
+20241115015956800      20241115015956800_0_1   3               
eec4913a-0d5f-4b8b-a0f5-934e252c2e45-0_0-7-14_20241115015956800.parquet 3       
Charlie 2024-10-26T02:00
+
+-- !timestamp3 --
+20241115015956800      20241115015956800_0_2   1               
eec4913a-0d5f-4b8b-a0f5-934e252c2e45-0_0-7-14_20241115015956800.parquet 1       
Alice   2024-10-25T15:00
+20241115015956800      20241115015956800_0_0   2               
eec4913a-0d5f-4b8b-a0f5-934e252c2e45-0_0-7-14_20241115015956800.parquet 2       
Bob     2024-10-25T16:30
+20241115015956800      20241115015956800_0_1   3               
eec4913a-0d5f-4b8b-a0f5-934e252c2e45-0_0-7-14_20241115015956800.parquet 3       
Charlie 2024-10-25T18:00
 
diff --git 
a/regression-test/suites/external_table_p2/hudi/test_hudi_timestamp.groovy 
b/regression-test/suites/external_table_p2/hudi/test_hudi_orc_tables.groovy
similarity index 63%
copy from 
regression-test/suites/external_table_p2/hudi/test_hudi_timestamp.groovy
copy to 
regression-test/suites/external_table_p2/hudi/test_hudi_orc_tables.groovy
index 36309322558..43638a23881 100644
--- a/regression-test/suites/external_table_p2/hudi/test_hudi_timestamp.groovy
+++ b/regression-test/suites/external_table_p2/hudi/test_hudi_orc_tables.groovy
@@ -15,13 +15,13 @@
 // specific language governing permissions and limitations
 // under the License.
 
-suite("test_hudi_timestamp", 
"p2,external,hudi,external_remote,external_remote_hudi") {
+suite("test_hudi_orc_tables", 
"p2,external,hudi,external_remote,external_remote_hudi") {
     String enabled = context.config.otherConfigs.get("enableExternalHudiTest")
     if (enabled == null || !enabled.equalsIgnoreCase("true")) {
         logger.info("disable hudi test")
     }
 
-    String catalog_name = "test_hudi_timestamp"
+    String catalog_name = "test_hudi_orc_tables"
     String props = context.config.otherConfigs.get("hudiEmrCatalog")
     sql """drop catalog if exists ${catalog_name};"""
     sql """
@@ -33,30 +33,9 @@ suite("test_hudi_timestamp", 
"p2,external,hudi,external_remote,external_remote_h
     sql """ switch ${catalog_name};"""
     sql """ use regression_hudi;""" 
     sql """ set enable_fallback_to_original_planner=false """
-
-    // TODO: fix hudi timezone issue and enable this
-    // qt_timestamp """ select * from hudi_table_with_timestamp order by id; 
"""
+    
+    qt_cow """ select * from  orc_hudi_table_cow; """
+    qt_mor """ select * from  orc_hudi_table_mor; """
 
     sql """drop catalog if exists ${catalog_name};"""
-}
-
-// DROP TABLE IF EXISTS hudi_table_with_timestamp;
-
-// -- create table
-// CREATE TABLE hudi_table_with_timestamp (
-//   id STRING,
-//   name STRING,
-//   event_time TIMESTAMP
-// ) USING HUDI
-// OPTIONS (
-//   type = 'cow',
-//   primaryKey = 'id',
-//   preCombineField = 'event_time'
-// );
-
-// SET TIME ZONE 'America/Los_Angeles';
-
-// INSERT OVERWRITE hudi_table_with_timestamp VALUES
-// ('1', 'Alice', timestamp('2024-10-25 08:00:00')),
-// ('2', 'Bob', timestamp('2024-10-25 09:30:00')),
-// ('3', 'Charlie', timestamp('2024-10-25 11:00:00'));
+}
\ No newline at end of file
diff --git 
a/regression-test/suites/external_table_p2/hudi/test_hudi_timestamp.groovy 
b/regression-test/suites/external_table_p2/hudi/test_hudi_timestamp.groovy
index 36309322558..3d7bd40b2d5 100644
--- a/regression-test/suites/external_table_p2/hudi/test_hudi_timestamp.groovy
+++ b/regression-test/suites/external_table_p2/hudi/test_hudi_timestamp.groovy
@@ -34,8 +34,22 @@ suite("test_hudi_timestamp", 
"p2,external,hudi,external_remote,external_remote_h
     sql """ use regression_hudi;""" 
     sql """ set enable_fallback_to_original_planner=false """
 
-    // TODO: fix hudi timezone issue and enable this
-    // qt_timestamp """ select * from hudi_table_with_timestamp order by id; 
"""
+    def test_timestamp_different_timezones = {
+        sql """set time_zone = 'America/Los_Angeles';"""
+        qt_timestamp1 """ select * from hudi_table_with_timestamp order by id; 
"""
+        sql """set time_zone = 'Asia/Shanghai';"""
+        qt_timestamp2 """ select * from hudi_table_with_timestamp order by id; 
"""
+        sql """set time_zone = 'UTC';"""
+        qt_timestamp3 """ select * from hudi_table_with_timestamp order by id; 
"""
+    }
+
+    // test native reader
+    test_timestamp_different_timezones()
+    sql """ set force_jni_scanner = true; """
+    // test jni reader
+    test_timestamp_different_timezones()
+    sql """ set force_jni_scanner = false; """
+
 
     sql """drop catalog if exists ${catalog_name};"""
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to