[ 
https://issues.apache.org/jira/browse/HIVE-28050?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Krisztian Kasa updated HIVE-28050:
----------------------------------
    Description: 
To support incremental rebuild of materialized views which definition does not 
have aggregate in presence of delete operations in any of its source tables the 
records of the source tables need to be uniquely identified and joined with the 
records present in the view.

One possibility is to project ROW_IDs of each source table in the view 
definition but the writeId component is changing at delete.

Another way is to project columns of primary keys or unique keys but these 
constraints are not enforced in Hive.

Current implementation leads to data correctness issues:
{code}
set hive.support.concurrency=true;
set hive.txn.manager=org.apache.hadoop.hive.ql.lockmgr.DbTxnManager;

create table cmv_basetable_n6 (a int, b varchar(256), c decimal(10,2), d int) 
stored as orc TBLPROPERTIES ('transactional'='true');

insert into cmv_basetable_n6 values
 (1, 'bob', 10.30, 2),
 (1, 'alfred', 10.30, 2),
 (2, 'bob', 3.14, 3),
 (2, 'bonnie', 172342.2, 3),
 (3, 'calvin', 978.76, 3),
 (3, 'charlie', 9.8, 1);

create table cmv_basetable_2_n3 (a int, b varchar(256), c decimal(10,2), d int) 
stored as orc TBLPROPERTIES ('transactional'='true');

insert into cmv_basetable_2_n3 values
 (1, 'alfred', 10.30, 2),
 (3, 'calvin', 978.76, 3);

CREATE MATERIALIZED VIEW cmv_mat_view_n6
  TBLPROPERTIES ('transactional'='true') AS
  SELECT cmv_basetable_n6.a, cmv_basetable_2_n3.c
  FROM cmv_basetable_n6 JOIN cmv_basetable_2_n3 ON (cmv_basetable_n6.a = 
cmv_basetable_2_n3.a)
  WHERE cmv_basetable_2_n3.c > 10.0;

delete from cmv_basetable_n6 where b = 'bob';

explain cbo
alter materialized view cmv_mat_view_n6 rebuild;
alter materialized view cmv_mat_view_n6 rebuild;

select * from cmv_mat_view_n6;
{code}

{code}
3       978.76
3       978.76
{code}

but it should be
{code}
1       10.30
3       978.76
3       978.76
{code}

  was:
To support incremental rebuild of materialized views which definition does not 
have aggregate in presence of delete operations in any of its source tables the 
records of the source tables need to be uniquely identified and joined with the 
records present in the view.

One possibility is to project ROW_IDs of each source table in the view 
definition but the writeId component is changing at delete.

Another way is to project columns of primary keys or unique keys but these 
constraints are not enforced in Hive.


> Disable Incremental non aggregated materialized view rebuild in presence of 
> delete operations
> ---------------------------------------------------------------------------------------------
>
>                 Key: HIVE-28050
>                 URL: https://issues.apache.org/jira/browse/HIVE-28050
>             Project: Hive
>          Issue Type: Bug
>          Components: Materialized views
>            Reporter: Krisztian Kasa
>            Assignee: Krisztian Kasa
>            Priority: Major
>              Labels: pull-request-available
>             Fix For: 4.1.0
>
>
> To support incremental rebuild of materialized views which definition does 
> not have aggregate in presence of delete operations in any of its source 
> tables the records of the source tables need to be uniquely identified and 
> joined with the records present in the view.
> One possibility is to project ROW_IDs of each source table in the view 
> definition but the writeId component is changing at delete.
> Another way is to project columns of primary keys or unique keys but these 
> constraints are not enforced in Hive.
> Current implementation leads to data correctness issues:
> {code}
> set hive.support.concurrency=true;
> set hive.txn.manager=org.apache.hadoop.hive.ql.lockmgr.DbTxnManager;
> create table cmv_basetable_n6 (a int, b varchar(256), c decimal(10,2), d int) 
> stored as orc TBLPROPERTIES ('transactional'='true');
> insert into cmv_basetable_n6 values
>  (1, 'bob', 10.30, 2),
>  (1, 'alfred', 10.30, 2),
>  (2, 'bob', 3.14, 3),
>  (2, 'bonnie', 172342.2, 3),
>  (3, 'calvin', 978.76, 3),
>  (3, 'charlie', 9.8, 1);
> create table cmv_basetable_2_n3 (a int, b varchar(256), c decimal(10,2), d 
> int) stored as orc TBLPROPERTIES ('transactional'='true');
> insert into cmv_basetable_2_n3 values
>  (1, 'alfred', 10.30, 2),
>  (3, 'calvin', 978.76, 3);
> CREATE MATERIALIZED VIEW cmv_mat_view_n6
>   TBLPROPERTIES ('transactional'='true') AS
>   SELECT cmv_basetable_n6.a, cmv_basetable_2_n3.c
>   FROM cmv_basetable_n6 JOIN cmv_basetable_2_n3 ON (cmv_basetable_n6.a = 
> cmv_basetable_2_n3.a)
>   WHERE cmv_basetable_2_n3.c > 10.0;
> delete from cmv_basetable_n6 where b = 'bob';
> explain cbo
> alter materialized view cmv_mat_view_n6 rebuild;
> alter materialized view cmv_mat_view_n6 rebuild;
> select * from cmv_mat_view_n6;
> {code}
> {code}
> 3     978.76
> 3     978.76
> {code}
> but it should be
> {code}
> 1     10.30
> 3     978.76
> 3     978.76
> {code}



--
This message was sent by Atlassian Jira
(v8.20.10#820010)

Reply via email to