This is an automated email from the ASF dual-hosted git repository.

morningman pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git

The following commit(s) were added to refs/heads/master by this push:
     new f3c415dfaec [fix](hudi) support reading hudi read optimized table with orc format (#44995)
f3c415dfaec is described below

commit f3c415dfaec698a443b657229a56be06923ed522
Author: Socrates <suyit...@selectdb.com>
AuthorDate: Thu Dec 5 01:05:54 2024 +0800

    [fix](hudi) support reading hudi read optimized table with orc format (#44995)

    ### What problem does this PR solve?

    Problem Summary:
    When reading a hudi read optimized (RO) table, the scan falls back from the
    JNI reader to the native reader. However, this fallback defaulted the file
    format to parquet and did not consider the case where the hudi table is
    stored in orc format.

    1. support reading hudi read optimized table with orc format
    2. fix explain results of HudiScanNode when force_jni_scanner=true
    3. add cases about timestamp with different timezones
---
 .../doris/datasource/hudi/source/HudiScanNode.java | 14 +++++++--
 .../hudi/test_hudi_orc_tables.out                  | 15 ++++++++++
 .../external_table_p2/hudi/test_hudi_timestamp.out | 31 ++++++++++++++++++--
 ...imestamp.groovy => test_hudi_orc_tables.groovy} | 33 ++++------------------
 .../hudi/test_hudi_timestamp.groovy                | 18 ++++++++++--
 5 files changed, 76 insertions(+), 35 deletions(-)

diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/hudi/source/HudiScanNode.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/hudi/source/HudiScanNode.java
index 28805aae63c..a73a2065d0f 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/datasource/hudi/source/HudiScanNode.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/hudi/source/HudiScanNode.java
@@ -25,6 +25,7 @@ import org.apache.doris.catalog.PartitionItem;
 import org.apache.doris.catalog.Type;
 import org.apache.doris.common.AnalysisException;
 import org.apache.doris.common.UserException;
+import org.apache.doris.common.util.FileFormatUtils;
 import org.apache.doris.common.util.LocationPath;
 import org.apache.doris.datasource.ExternalTable;
 import org.apache.doris.datasource.FileSplit;
@@ -247,8 +248,15 @@ public class HudiScanNode extends HiveScanNode {
                 && !sessionVariable.isForceJniScanner()
                 && hudiSplit.getHudiDeltaLogs().isEmpty()) {
             // no logs, is read optimize table, fallback to use native reader
-            // TODO: support read orc hudi table in native reader
-            rangeDesc.setFormatType(TFileFormatType.FORMAT_PARQUET);
+            String fileFormat = FileFormatUtils.getFileFormatBySuffix(hudiSplit.getDataFilePath())
+                    .orElse("Unknown");
+            if (fileFormat.equals("parquet")) {
+                rangeDesc.setFormatType(TFileFormatType.FORMAT_PARQUET);
+            } else if (fileFormat.equals("orc")) {
+                rangeDesc.setFormatType(TFileFormatType.FORMAT_ORC);
+            } else {
+                throw new RuntimeException("Unsupported file format: " + fileFormat);
+            }
         }
         setHudiParams(rangeDesc, hudiSplit);
     }
@@ -495,7 +503,7 @@ public class HudiScanNode extends HiveScanNode {
             List<String> logs = fileSlice.getLogFiles().map(HoodieLogFile::getPath)
                     .map(StoragePath::toString)
                     .collect(Collectors.toList());
-            if (logs.isEmpty()) {
+            if (logs.isEmpty() && !sessionVariable.isForceJniScanner()) {
                 noLogsSplitNum.incrementAndGet();
             }
 
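For reference, a minimal standalone sketch of the suffix-based dispatch this change introduces. The FileFormat enum, bySuffix method and FormatBySuffixSketch class below are illustrative stand-ins only, not Doris classes; the actual patch relies on FileFormatUtils.getFileFormatBySuffix and TFileFormatType as shown in the hunk above.

import java.util.Locale;
import java.util.Optional;

public class FormatBySuffixSketch {
    // Illustrative stand-in for TFileFormatType.
    enum FileFormat { PARQUET, ORC }

    // Derive the format from the base file's suffix instead of assuming parquet.
    static Optional<FileFormat> bySuffix(String path) {
        String lower = path.toLowerCase(Locale.ROOT);
        if (lower.endsWith(".parquet")) {
            return Optional.of(FileFormat.PARQUET);
        } else if (lower.endsWith(".orc")) {
            return Optional.of(FileFormat.ORC);
        }
        return Optional.empty();
    }

    public static void main(String[] args) {
        // A read optimized split backed by an ORC base file now resolves to ORC;
        // an unrecognized suffix fails fast, mirroring the RuntimeException above.
        String orcBase = "a99e363a-6c10-40f3-a675-9117506d1a43-0_0-38-94_20241204190011744.orc";
        System.out.println(bySuffix(orcBase)
                .orElseThrow(() -> new RuntimeException("Unsupported file format: " + orcBase)));
    }
}

Failing fast on an unrecognized suffix means a split backed by an unexpected base file surfaces an error instead of being silently misread as parquet.
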
diff --git a/regression-test/data/external_table_p2/hudi/test_hudi_orc_tables.out b/regression-test/data/external_table_p2/hudi/test_hudi_orc_tables.out
new file mode 100644
index 00000000000..9e28074dc91
--- /dev/null
+++ b/regression-test/data/external_table_p2/hudi/test_hudi_orc_tables.out
@@ -0,0 +1,15 @@
+-- This file is automatically generated. You should know what you did if you want to edit this
+-- !cow --
+20241204190011744 20241204190011744_0_6 20241204190011744_0_0 a99e363a-6c10-40f3-a675-9117506d1a43-0_0-38-94_20241204190011744.orc 1 A
+20241204190011744 20241204190011744_0_7 20241204190011744_2_0 a99e363a-6c10-40f3-a675-9117506d1a43-0_0-38-94_20241204190011744.orc 3 C
+20241204190011744 20241204190011744_0_8 20241204190011744_4_0 a99e363a-6c10-40f3-a675-9117506d1a43-0_0-38-94_20241204190011744.orc 5 E
+20241204190011744 20241204190011744_0_9 20241204190011744_1_0 a99e363a-6c10-40f3-a675-9117506d1a43-0_0-38-94_20241204190011744.orc 2 B
+20241204190011744 20241204190011744_0_10 20241204190011744_3_0 a99e363a-6c10-40f3-a675-9117506d1a43-0_0-38-94_20241204190011744.orc 4 D
+
+-- !mor --
+20241204190002046 20241204190002046_0_11 20241204190002046_0_0 b1e68412-01d6-467f-b4c2-b4b18ec71346-0_0-30-75_20241204190002046.orc 1 A
+20241204190002046 20241204190002046_0_12 20241204190002046_2_0 b1e68412-01d6-467f-b4c2-b4b18ec71346-0_0-30-75_20241204190002046.orc 3 C
+20241204190002046 20241204190002046_0_13 20241204190002046_4_0 b1e68412-01d6-467f-b4c2-b4b18ec71346-0_0-30-75_20241204190002046.orc 5 E
+20241204190002046 20241204190002046_0_14 20241204190002046_1_0 b1e68412-01d6-467f-b4c2-b4b18ec71346-0_0-30-75_20241204190002046.orc 2 B
+20241204190002046 20241204190002046_0_15 20241204190002046_3_0 b1e68412-01d6-467f-b4c2-b4b18ec71346-0_0-30-75_20241204190002046.orc 4 D
+
diff --git a/regression-test/data/external_table_p2/hudi/test_hudi_timestamp.out b/regression-test/data/external_table_p2/hudi/test_hudi_timestamp.out
index dc47ff86d90..9bdb0f7cb72 100644
--- a/regression-test/data/external_table_p2/hudi/test_hudi_timestamp.out
+++ b/regression-test/data/external_table_p2/hudi/test_hudi_timestamp.out
@@ -1,6 +1,31 @@
 -- This file is automatically generated. You should know what you did if you want to edit this
--- !timestamp --
+-- !timestamp1 --
 20241115015956800 20241115015956800_0_2 1 eec4913a-0d5f-4b8b-a0f5-934e252c2e45-0_0-7-14_20241115015956800.parquet 1 Alice 2024-10-25T08:00
-20241115015956800 20241115015956800_0_0 2 eec4913a-0d5f-4b8b-a0f5-934e252c2e45-0_0-7-14_20241115015956800.parquet 2 Bob 2024-10-25T09:30:00
-20241115015956800 20241115015956800_0_1 3 eec4913a-0d5f-4b8b-a0f5-934e252c2e45-0_0-7-14_20241115015956800.parquet 3 Charlie 2024-10-25T11:00:00
+20241115015956800 20241115015956800_0_0 2 eec4913a-0d5f-4b8b-a0f5-934e252c2e45-0_0-7-14_20241115015956800.parquet 2 Bob 2024-10-25T09:30
+20241115015956800 20241115015956800_0_1 3 eec4913a-0d5f-4b8b-a0f5-934e252c2e45-0_0-7-14_20241115015956800.parquet 3 Charlie 2024-10-25T11:00
+
+-- !timestamp2 --
+20241115015956800 20241115015956800_0_2 1 eec4913a-0d5f-4b8b-a0f5-934e252c2e45-0_0-7-14_20241115015956800.parquet 1 Alice 2024-10-25T23:00
+20241115015956800 20241115015956800_0_0 2 eec4913a-0d5f-4b8b-a0f5-934e252c2e45-0_0-7-14_20241115015956800.parquet 2 Bob 2024-10-26T00:30
+20241115015956800 20241115015956800_0_1 3 eec4913a-0d5f-4b8b-a0f5-934e252c2e45-0_0-7-14_20241115015956800.parquet 3 Charlie 2024-10-26T02:00
+
+-- !timestamp3 --
+20241115015956800 20241115015956800_0_2 1 eec4913a-0d5f-4b8b-a0f5-934e252c2e45-0_0-7-14_20241115015956800.parquet 1 Alice 2024-10-25T15:00
+20241115015956800 20241115015956800_0_0 2 eec4913a-0d5f-4b8b-a0f5-934e252c2e45-0_0-7-14_20241115015956800.parquet 2 Bob 2024-10-25T16:30
+20241115015956800 20241115015956800_0_1 3 eec4913a-0d5f-4b8b-a0f5-934e252c2e45-0_0-7-14_20241115015956800.parquet 3 Charlie 2024-10-25T18:00
+
+-- !timestamp1 --
+20241115015956800 20241115015956800_0_2 1 eec4913a-0d5f-4b8b-a0f5-934e252c2e45-0_0-7-14_20241115015956800.parquet 1 Alice 2024-10-25T08:00
+20241115015956800 20241115015956800_0_0 2 eec4913a-0d5f-4b8b-a0f5-934e252c2e45-0_0-7-14_20241115015956800.parquet 2 Bob 2024-10-25T09:30
+20241115015956800 20241115015956800_0_1 3 eec4913a-0d5f-4b8b-a0f5-934e252c2e45-0_0-7-14_20241115015956800.parquet 3 Charlie 2024-10-25T11:00
+
+-- !timestamp2 --
+20241115015956800 20241115015956800_0_2 1 eec4913a-0d5f-4b8b-a0f5-934e252c2e45-0_0-7-14_20241115015956800.parquet 1 Alice 2024-10-25T23:00
+20241115015956800 20241115015956800_0_0 2 eec4913a-0d5f-4b8b-a0f5-934e252c2e45-0_0-7-14_20241115015956800.parquet 2 Bob 2024-10-26T00:30
+20241115015956800 20241115015956800_0_1 3 eec4913a-0d5f-4b8b-a0f5-934e252c2e45-0_0-7-14_20241115015956800.parquet 3 Charlie 2024-10-26T02:00
+
+-- !timestamp3 --
+20241115015956800 20241115015956800_0_2 1 eec4913a-0d5f-4b8b-a0f5-934e252c2e45-0_0-7-14_20241115015956800.parquet 1 Alice 2024-10-25T15:00
+20241115015956800 20241115015956800_0_0 2 eec4913a-0d5f-4b8b-a0f5-934e252c2e45-0_0-7-14_20241115015956800.parquet 2 Bob 2024-10-25T16:30
+20241115015956800 20241115015956800_0_1 3 eec4913a-0d5f-4b8b-a0f5-934e252c2e45-0_0-7-14_20241115015956800.parquet 3 Charlie 2024-10-25T18:00
 
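For context on the expected results above, a small standalone java.time sketch (an illustration only, not part of the patch) showing how a single stored instant renders in the three session time zones the test sets:

import java.time.Instant;
import java.time.ZoneId;
import java.time.format.DateTimeFormatter;

public class TimestampZonesSketch {
    public static void main(String[] args) {
        // Alice's event_time was written as 2024-10-25 08:00 in America/Los_Angeles (PDT, UTC-7),
        // i.e. the instant 2024-10-25T15:00:00Z.
        Instant alice = Instant.parse("2024-10-25T15:00:00Z");
        DateTimeFormatter fmt = DateTimeFormatter.ofPattern("yyyy-MM-dd'T'HH:mm");
        for (String zone : new String[] {"America/Los_Angeles", "Asia/Shanghai", "UTC"}) {
            // Prints 2024-10-25T08:00, 2024-10-25T23:00 and 2024-10-25T15:00 respectively,
            // matching the qt_timestamp1/2/3 blocks above.
            System.out.println(zone + " -> " + alice.atZone(ZoneId.of(zone)).format(fmt));
        }
    }
}

The 08:00 / 23:00 / 15:00 triplet (and the corresponding shifts for Bob and Charlie) follows directly from PDT being UTC-7 and Asia/Shanghai being UTC+8 on that date, and the two repeated result blocks cover the native reader and the JNI reader runs.
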
diff --git a/regression-test/suites/external_table_p2/hudi/test_hudi_timestamp.groovy b/regression-test/suites/external_table_p2/hudi/test_hudi_orc_tables.groovy
similarity index 63%
copy from regression-test/suites/external_table_p2/hudi/test_hudi_timestamp.groovy
copy to regression-test/suites/external_table_p2/hudi/test_hudi_orc_tables.groovy
index 36309322558..43638a23881
--- a/regression-test/suites/external_table_p2/hudi/test_hudi_timestamp.groovy
+++ b/regression-test/suites/external_table_p2/hudi/test_hudi_orc_tables.groovy
@@ -15,13 +15,13 @@
 // specific language governing permissions and limitations
 // under the License.
 
-suite("test_hudi_timestamp", "p2,external,hudi,external_remote,external_remote_hudi") {
+suite("test_hudi_orc_tables", "p2,external,hudi,external_remote,external_remote_hudi") {
     String enabled = context.config.otherConfigs.get("enableExternalHudiTest")
     if (enabled == null || !enabled.equalsIgnoreCase("true")) {
         logger.info("disable hudi test")
     }
 
-    String catalog_name = "test_hudi_timestamp"
+    String catalog_name = "test_hudi_orc_tables"
     String props = context.config.otherConfigs.get("hudiEmrCatalog")
     sql """drop catalog if exists ${catalog_name};"""
     sql """
@@ -33,30 +33,9 @@ suite("test_hudi_timestamp", "p2,external,hudi,external_remote,external_remote_h
     sql """ switch ${catalog_name};"""
     sql """ use regression_hudi;"""
     sql """ set enable_fallback_to_original_planner=false """
-
-    // TODO: fix hudi timezone issue and enable this
-    // qt_timestamp """ select * from hudi_table_with_timestamp order by id; """
+
+    qt_cow """ select * from orc_hudi_table_cow; """
+    qt_mor """ select * from orc_hudi_table_mor; """
 
     sql """drop catalog if exists ${catalog_name};"""
-}
-
-// DROP TABLE IF EXISTS hudi_table_with_timestamp;
-
-// -- create table
-// CREATE TABLE hudi_table_with_timestamp (
-//     id STRING,
-//     name STRING,
-//     event_time TIMESTAMP
-// ) USING HUDI
-// OPTIONS (
-//     type = 'cow',
-//     primaryKey = 'id',
-//     preCombineField = 'event_time'
-// );
-
-// SET TIME ZONE 'America/Los_Angeles';
-
-// INSERT OVERWRITE hudi_table_with_timestamp VALUES
-// ('1', 'Alice', timestamp('2024-10-25 08:00:00')),
-// ('2', 'Bob', timestamp('2024-10-25 09:30:00')),
-// ('3', 'Charlie', timestamp('2024-10-25 11:00:00'));
+}
\ No newline at end of file
diff --git a/regression-test/suites/external_table_p2/hudi/test_hudi_timestamp.groovy b/regression-test/suites/external_table_p2/hudi/test_hudi_timestamp.groovy
index 36309322558..3d7bd40b2d5 100644
--- a/regression-test/suites/external_table_p2/hudi/test_hudi_timestamp.groovy
+++ b/regression-test/suites/external_table_p2/hudi/test_hudi_timestamp.groovy
@@ -34,8 +34,22 @@ suite("test_hudi_timestamp", "p2,external,hudi,external_remote,external_remote_h
     sql """ use regression_hudi;"""
     sql """ set enable_fallback_to_original_planner=false """
 
-    // TODO: fix hudi timezone issue and enable this
-    // qt_timestamp """ select * from hudi_table_with_timestamp order by id; """
+    def test_timestamp_different_timezones = {
+        sql """set time_zone = 'America/Los_Angeles';"""
+        qt_timestamp1 """ select * from hudi_table_with_timestamp order by id; """
+        sql """set time_zone = 'Asia/Shanghai';"""
+        qt_timestamp2 """ select * from hudi_table_with_timestamp order by id; """
+        sql """set time_zone = 'UTC';"""
+        qt_timestamp3 """ select * from hudi_table_with_timestamp order by id; """
+    }
+
+    // test native reader
+    test_timestamp_different_timezones()
+    sql """ set force_jni_scanner = true; """
+    // test jni reader
+    test_timestamp_different_timezones()
+    sql """ set force_jni_scanner = false; """
+
 
     sql """drop catalog if exists ${catalog_name};"""
 }

---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org