bithw1 opened a new issue, #12011:
URL: https://github.com/apache/hudi/issues/12011
Hi,
I am using Hudi 0.15.0, and I find that the option
`hoodie.datasource.hive_sync.skip_ro_suffix` does not seem to work as
expected.
I first created a MOR table using the spark-sql CLI (version: spark-3.3.2) with
`hoodie.datasource.hive_sync.skip_ro_suffix=true`, and then looked at the
resulting table definitions from the Hive CLI. These are the statements I ran:
```
CREATE TABLE IF NOT EXISTS hudi_mor_17 (
a INT,
b INT,
c INT
)
USING hudi
tblproperties(
type='mor',
primaryKey='a',
preCombineField='c',
hoodie.datasource.hive_sync.skip_ro_suffix='true'
)
insert into hudi_mor_17 select 1,1,1;
insert into hudi_mor_17 select 2,2,2;
insert into hudi_mor_17 select 3,3,3;
insert into hudi_mor_17 select 4,4,4;
insert into hudi_mor_17 select 5,5,5;
update hudi_mor_17 set b = 11 where a = 1;
update hudi_mor_17 set b = 22 where a = 2;
```
After the table was created and some data had been inserted and updated, I
looked at the table definition.
```
CREATE TABLE `hudi_mor_17`(
`_hoodie_commit_time` string,
`_hoodie_commit_seqno` string,
`_hoodie_record_key` string,
`_hoodie_partition_path` string,
`_hoodie_file_name` string,
`a` int,
`b` int,
`c` int)
ROW FORMAT SERDE
'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
WITH SERDEPROPERTIES (
'hoodie.query.as.ro.table'='false',
'path'='hdfs://hadoop.master:9000/user/hive/warehouse/hudi_mor_17')
STORED AS INPUTFORMAT
'org.apache.hudi.hadoop.realtime.HoodieParquetRealtimeInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat'
LOCATION
'hdfs://hadoop.master:9000/user/hive/warehouse/hudi_mor_17'
TBLPROPERTIES (
'hoodie.datasource.hive_sync.skip_ro_suffix'='true',
'last_commit_completion_time_sync'='20240926132024516',
'last_commit_time_sync'='20240926132023403',
'preCombineField'='c',
'primaryKey'='a',
'provider'='hudi',
'spark.sql.create.version'='3.3.2',
'spark.sql.sources.provider'='hudi',
'spark.sql.sources.schema.numParts'='1',
'spark.sql.sources.schema.part.0'='{"type":"struct","fields":[{"name":"_hoodie_commit_time","type":"string","nullable":true,"metadata":{}},{"name":"_hoodie_commit_seqno","type":"string","nullable":true,"metadata":{}},{"name":"_hoodie_record_key","type":"string","nullable":true,"metadata":{}},{"name":"_hoodie_partition_path","type":"string","nullable":true,"metadata":{}},{"name":"_hoodie_file_name","type":"string","nullable":true,"metadata":{}},{"name":"a","type":"integer","nullable":true,"metadata":{}},{"name":"b","type":"integer","nullable":true,"metadata":{}},{"name":"c","type":"integer","nullable":true,"metadata":{}}]}',
'transient_lastDdlTime'='1727327985',
'type'='mor')
```
and
```
CREATE EXTERNAL TABLE `hudi_mor_17_rt`(
`_hoodie_commit_time` string COMMENT '',
`_hoodie_commit_seqno` string COMMENT '',
`_hoodie_record_key` string COMMENT '',
`_hoodie_partition_path` string COMMENT '',
`_hoodie_file_name` string COMMENT '',
`a` int COMMENT '',
`b` int COMMENT '',
`c` int COMMENT '')
ROW FORMAT SERDE
'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
WITH SERDEPROPERTIES (
'hoodie.query.as.ro.table'='false',
'path'='hdfs://hadoop.master:9000/user/hive/warehouse/hudi_mor_17')
STORED AS INPUTFORMAT
'org.apache.hudi.hadoop.realtime.HoodieParquetRealtimeInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat'
LOCATION
'hdfs://hadoop.master:9000/user/hive/warehouse/hudi_mor_17'
TBLPROPERTIES (
'last_commit_completion_time_sync'='20240926132024516',
'last_commit_time_sync'='20240926132023403',
'spark.sql.create.version'='3.3.2',
'spark.sql.sources.provider'='hudi',
'spark.sql.sources.schema.numParts'='1',
'spark.sql.sources.schema.part.0'='{"type":"struct","fields":[{"name":"_hoodie_commit_time","type":"string","nullable":true,"metadata":{}},{"name":"_hoodie_commit_seqno","type":"string","nullable":true,"metadata":{}},{"name":"_hoodie_record_key","type":"string","nullable":true,"metadata":{}},{"name":"_hoodie_partition_path","type":"string","nullable":true,"metadata":{}},{"name":"_hoodie_file_name","type":"string","nullable":true,"metadata":{}},{"name":"a","type":"integer","nullable":true,"metadata":{}},{"name":"b","type":"integer","nullable":true,"metadata":{}},{"name":"c","type":"integer","nullable":true,"metadata":{}}]}',
'transient_lastDdlTime'='1727328010')
```
It is expected that `hudi_mor_17_ro` is not created, but the two tables that
were created both have `'hoodie.query.as.ro.table'='false'` and
`STORED AS INPUTFORMAT
'org.apache.hudi.hadoop.realtime.HoodieParquetRealtimeInputFormat'`.
I queried both tables and could read the updated records from each, so
neither of these two tables is a read-optimized (ro) table. I would like to ask
what the option `hoodie.datasource.hive_sync.skip_ro_suffix` does; it looks like
the ro table has been lost...
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]