[ 
https://issues.apache.org/jira/browse/HUDI-2781?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17451641#comment-17451641
 ] 

Yann Byron commented on HUDI-2781:
----------------------------------

[~xushiyan] 

Environment:

    spark 3.0.3, hive3.1.2, hudi release-0.10.0-rc2

Conclusions:
 * `primaryKey` must be provided when creating a table, so the related 
content should be removed from [https://hudi.apache.org/docs/quick-start-guide];
 * The ShowPartitions result is wrong after partitions are deleted/dropped;
 * Everything else is OK.

 

Details:

Create Table:
{code:java}

-- COW table: not partitioned, no preCombineField
CREATE TABLE IF NOT EXISTS cow_nonpt_nonpcf_tbl (
  id INT,
  name STRING,
  price DOUBLE
)
USING hudi
OPTIONS (
  type = 'cow',
  primaryKey = 'id'
);

-- MOR table: not partitioned, no preCombineField
CREATE TABLE IF NOT EXISTS mor_nonpt_nonpcf_tbl (
  id INT,
  name STRING,
  price DOUBLE
)
USING hudi
OPTIONS (
  type = 'mor',
  primaryKey = 'id'
);

-- COW table: not partitioned, preCombineField = ts
CREATE TABLE IF NOT EXISTS cow_nonpt_pcf_tbl (
  id INT,
  name STRING,
  price DOUBLE,
  ts BIGINT
)
USING hudi
OPTIONS (
  type = 'cow',
  primaryKey = 'id',
  preCombineField = 'ts'
);

-- MOR table: not partitioned, preCombineField = ts
CREATE TABLE IF NOT EXISTS mor_nonpt_pcf_tbl (
  id INT,
  name STRING,
  price DOUBLE,
  ts BIGINT
)
USING hudi
OPTIONS (
  type = 'mor',
  primaryKey = 'id',
  preCombineField = 'ts'
);

-- COW table: partitioned by (dt, hh), no preCombineField.
-- Configured through `tblproperties` with an explicit location; no `type`
-- property is set on this table.
CREATE TABLE IF NOT EXISTS cow_pt_nonpcf_tbl (
  id BIGINT,
  name STRING,
  dt STRING,
  hh STRING
)
USING hudi
LOCATION 'file:///tmp/hudi/cow_pt_nonpcf_tbl'
TBLPROPERTIES (
  primaryKey = 'id'
)
PARTITIONED BY (dt, hh);

-- MOR table: partitioned by (dt, hh), preCombineField = ts.
-- Configured through `tblproperties` with an explicit location.
CREATE TABLE IF NOT EXISTS mor_pt_pcf_tbl (
  id BIGINT,
  name STRING,
  ts BIGINT,
  dt STRING,
  hh STRING
)
USING hudi
LOCATION 'file:///tmp/hudi/mor_pt_pcf_tbl'
TBLPROPERTIES (
  type = 'mor',
  primaryKey = 'id',
  preCombineField = 'ts'
)
PARTITIONED BY (dt, hh); {code}
 

 

CTAS

 
{code:java}
-- CTAS into a table partitioned by dt, configured through `options`
CREATE TABLE ctas_cow_pt_nonpcf_tbl
USING hudi
OPTIONS (
  type = 'cow',
  primaryKey = 'id'
)
PARTITIONED BY (dt)
AS
SELECT
  1 AS id,
  'a1' AS name,
  10 AS price,
  1000 AS dt;

-- CTAS into a non-partitioned table, configured through `tblproperties`
CREATE TABLE ctas_cow_nonpt_nonpcf_tbl
USING hudi
TBLPROPERTIES (
  primaryKey = 'id'
)
AS
SELECT
  1 AS id,
  'a1' AS name,
  10 AS price; {code}
 

 

Create table based on existing path

 
{code:java}
-- Register a table over the given location; no column list is declared here.
CREATE TABLE existing_hudi_tbl
USING hudi
OPTIONS (
  primaryKey = 'id',
  preCombineField = 'ts'
)
PARTITIONED BY (dt)
LOCATION 'file:///tmp/hudi/dataframe_hudi_table'; {code}
 

 

Insert
{code:java}
-- Plain INSERT INTO on the non-partitioned tables
INSERT INTO cow_nonpt_nonpcf_tbl
SELECT 1, 'a1', 20;

INSERT INTO mor_nonpt_nonpcf_tbl
SELECT 1, 'a1', 20;

INSERT INTO cow_nonpt_pcf_tbl
SELECT 1, 'a1', 20, 1000;

INSERT INTO mor_nonpt_pcf_tbl
SELECT 1, 'a1', 20, 1000;

-- Static partition: partition values are given in the PARTITION clause,
-- so the SELECT supplies only the remaining columns
INSERT INTO cow_pt_nonpcf_tbl PARTITION (dt = '2021-01-02', hh='10')
SELECT 1, 'a1';

INSERT INTO mor_pt_pcf_tbl PARTITION (dt = '2021-01-02', hh='10')
SELECT 1, 'a1', '1000';

-- Dynamic partition: partition values come last in the SELECT list
INSERT INTO cow_pt_nonpcf_tbl
SELECT 2, 'a2', '2021-01-02', '11';

INSERT INTO mor_pt_pcf_tbl
SELECT 2, 'a2', '1000', '2021-01-02', '11'; {code}
 

 

Insert overwrite
{code:java}
-- INSERT OVERWRITE on the non-partitioned tables
-- (written both with and without the TABLE keyword)
INSERT OVERWRITE TABLE cow_nonpt_nonpcf_tbl
SELECT 3, 'a3', 30;

INSERT OVERWRITE TABLE mor_nonpt_nonpcf_tbl
SELECT 3, 'a3', 30;

INSERT OVERWRITE cow_nonpt_pcf_tbl
SELECT 3, 'a3', 30, 1000;

INSERT OVERWRITE mor_nonpt_pcf_tbl
SELECT 3, 'a3', 30, 1000;

-- INSERT OVERWRITE with a static partition
INSERT OVERWRITE cow_pt_nonpcf_tbl PARTITION (dt = '2021-01-02', hh='10')
SELECT 1, 'a1_1';

INSERT OVERWRITE mor_pt_pcf_tbl PARTITION (dt = '2021-01-02', hh='10')
SELECT 1, 'a1_1', '1100';

-- INSERT OVERWRITE with dynamic partitions
INSERT OVERWRITE TABLE cow_pt_nonpcf_tbl
SELECT
  2 AS id,
  'a2_2',
  '2021-01-02' AS dt,
  '11' AS hh;

INSERT OVERWRITE TABLE mor_pt_pcf_tbl
SELECT
  2 AS id,
  'a2_2',
  '2200',
  '2021-01-02' AS dt,
  '11' AS hh; {code}
 

Update
 * The `update` syntax cannot be used if no preCombineField is provided.

{code:java}
-- Update the row with id = 3 in the non-partitioned tables that have a
-- preCombineField
UPDATE cow_nonpt_pcf_tbl
SET
  price = price * 2,
  name = 'a3_3',
  ts = 3000
WHERE id = 3;

UPDATE mor_nonpt_pcf_tbl
SET
  price = price * 2,
  name = 'a3_3',
  ts = 3000
WHERE id = 3;

-- Update every even-id row in the partitioned MOR table
UPDATE mor_pt_pcf_tbl
SET
  name = 'aa_2',
  ts = 2222
WHERE id % 2 = 0; {code}
 

Merge into
{code:java}
-- Source table (created `using hudi`) for merging into the
-- non-partitioned tables
CREATE TABLE merge_source (
  id INT,
  name STRING,
  price DOUBLE,
  ts BIGINT
)
USING hudi
TBLPROPERTIES (
  primaryKey = 'id',
  preCombineField = 'ts'
);

INSERT INTO merge_source
VALUES
  (1, "new_a1", 22.22, 4001),
  (2, "new_a2", 33.33, 4001),
  (3, "new_a3", 44.44, 4001);

-- Upsert: matched ids are updated, unmatched ids are inserted
MERGE INTO cow_nonpt_nonpcf_tbl AS target
USING merge_source AS source
  ON target.id = source.id
WHEN MATCHED THEN UPDATE SET *
WHEN NOT MATCHED THEN INSERT *;

MERGE INTO mor_nonpt_pcf_tbl AS target
USING merge_source AS source
  ON target.id = source.id
WHEN MATCHED THEN UPDATE SET *
WHEN NOT MATCHED THEN INSERT *;

-- Source table (created `using parquet`) for merging into the partitioned
-- tables; `flag` selects the action taken for each matched row
CREATE TABLE merge_source2 (
  id INT,
  name STRING,
  flag STRING,
  dt STRING,
  hh STRING
)
USING parquet;

INSERT INTO merge_source2
VALUES
  (1, "new_a1", 'update', '2021-01-02', '10'),
  (2, "new_a2", 'delete', '2021-01-02', '11'),
  (3, "new_a3", 'insert', '2021-01-02', '12');

-- Matched rows are deleted when flag = 'delete' and updated otherwise;
-- unmatched rows are inserted with an explicit column list
MERGE INTO cow_pt_nonpcf_tbl AS target
USING (
  SELECT
    id,
    name,
    flag,
    dt,
    hh
  FROM merge_source2
) source
  ON target.id = source.id
WHEN MATCHED AND flag != 'delete' THEN
  UPDATE SET id = source.id, name = source.name
WHEN MATCHED AND flag = 'delete' THEN
  DELETE
WHEN NOT MATCHED THEN
  INSERT (id, name, dt, hh)
  VALUES (source.id, source.name, source.dt, source.hh);

-- Same flow for the MOR table; the subquery adds a constant '1000' as ts
MERGE INTO mor_pt_pcf_tbl AS target
USING (
  SELECT
    id,
    name,
    '1000' AS ts,
    flag,
    dt,
    hh
  FROM merge_source2
) source
  ON target.id = source.id
WHEN MATCHED AND flag != 'delete' THEN
  UPDATE SET *
WHEN MATCHED AND flag = 'delete' THEN
  DELETE
WHEN NOT MATCHED THEN
  INSERT (id, name, ts, dt, hh)
  VALUES (source.id, source.name, source.ts, source.dt, source.hh); {code}
Delete
{code:java}
-- Delete the row with id = 1 from one COW and one MOR table
DELETE FROM cow_nonpt_nonpcf_tbl
WHERE id = 1;

DELETE FROM mor_nonpt_pcf_tbl
WHERE id = 1; {code}
 

Alter
{code:java}
-- Rename the table
ALTER TABLE cow_nonpt_nonpcf_tbl
  RENAME TO cow_nonpt_nonpcf_tbl_2;

-- Add a column to the renamed table
ALTER TABLE cow_nonpt_nonpcf_tbl_2
  ADD COLUMNS (ext0 STRING);

-- Change the type of column id to bigint (the column name stays the same)
ALTER TABLE cow_nonpt_nonpcf_tbl_2
  CHANGE COLUMN id id BIGINT;

-- Show the partitions of both partitioned tables
SHOW PARTITIONS mor_pt_pcf_tbl;
SHOW PARTITIONS cow_pt_nonpcf_tbl;

-- Drop a single partition
ALTER TABLE cow_pt_nonpcf_tbl
  DROP PARTITION (dt='2021-01-02', hh='10');

-- Set table properties and serde properties
ALTER TABLE mor_nonpt_pcf_tbl
  SET TBLPROPERTIES (hoodie.keep.max.commits = '10');

ALTER TABLE mor_nonpt_pcf_tbl
  SET SERDEPROPERTIES (hoodie.keep.max.commits = '10'); {code}
 

> Test 0.10 RC for Spark 3.x
> --------------------------
>
>                 Key: HUDI-2781
>                 URL: https://issues.apache.org/jira/browse/HUDI-2781
>             Project: Apache Hudi
>          Issue Type: Test
>          Components: Spark Integration
>            Reporter: Raymond Xu
>            Assignee: Yann Byron
>            Priority: Blocker
>              Labels: pull-request-available, sev:high
>             Fix For: 0.10.0
>
>
> Combinations
>  # Spark 3.0 & 3.1.x against Hive 2
>  # Spark 3.0 & 3.1.X against Hive 3
>  # Spark 3.2 against Hive 2
>  
> Let's test a COW and MOR long running DAG across these environments and get a 
> report with bugs/issues
>  
> We have YAMLs here, that can be run across all different environments listed 
> here.
> [https://github.com/apache/hudi/tree/master/docker/demo/config/test-suite]



--
This message was sent by Atlassian Jira
(v8.20.1#820001)

Reply via email to