[ https://issues.apache.org/jira/browse/HIVE-27924?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Wenhao Li updated HIVE-27924: ----------------------------- Description: h1. Summary The incremental rebuild plan and execution output are incorrect when one side of the table join has inserted/deleted join keys that the other side has deleted/inserted (note the order). The argument is that tuples that have never been present simultaneously should not interact with one another, i.e., one's inserts should not join the other's deletes. h1. Related Test Case The bug was discovered during replication of the test case: ??hive/ql/src/test/queries/clientpositive/materialized_view_create_rewrite_5.q?? h1. Steps to Reproduce the Issue # Configurations: {code:java} SET hive.vectorized.execution.enabled=false; set hive.support.concurrency=true; set hive.txn.manager=org.apache.hadoop.hive.ql.lockmgr.DbTxnManager; set hive.strict.checks.cartesian.product=false; set hive.materializedview.rewriting=true;{code} # {code:java} create table cmv_basetable_n6 (a int, b varchar(256), c decimal(10,2), d int) stored as orc TBLPROPERTIES ('transactional'='true'); {code} # {code:java} insert into cmv_basetable_n6 values (1, 'alfred', 10.30, 2), (1, 'charlie', 20.30, 2); {code} # {code:java} create table cmv_basetable_2_n3 (a int, b varchar(256), c decimal(10,2), d int) stored as orc TBLPROPERTIES ('transactional'='true'); {code} # {code:java} insert into cmv_basetable_2_n3 values (1, 'bob', 30.30, 2), (1, 'bonnie', 40.30, 2);{code} # {code:java} CREATE MATERIALIZED VIEW cmv_mat_view_n6 TBLPROPERTIES ('transactional'='true') AS SELECT cmv_basetable_n6.a, cmv_basetable_2_n3.c FROM cmv_basetable_n6 JOIN cmv_basetable_2_n3 ON (cmv_basetable_n6.a = cmv_basetable_2_n3.a) WHERE cmv_basetable_2_n3.c > 10.0;{code} # {code:java} show tables; {code} !截图.PNG! # Select tuples, including deletion and with VirtualColumn's, from the MV and source tables. We see that the MV is correctly built upon creation: {code:java} SELECT ROW__IS__DELETED, ROW__ID, * FROM cmv_mat_view_n6('acid.fetch.deleted.rows'='true');{code} !截图1.PNG! # {code:java} SELECT ROW__IS__DELETED, ROW__ID, * FROM cmv_basetable_n6('acid.fetch.deleted.rows'='true'); {code} !截图2.PNG! # {code:java} SELECT ROW__IS__DELETED, ROW__ID, * FROM cmv_basetable_2_n3('acid.fetch.deleted.rows'='true'); {code} !截图3.PNG! # Now make an insert to the LHS and a delete to the RHS source table: {code:java} insert into cmv_basetable_n6 values (1, 'kevin', 50.30, 2); DELETE FROM cmv_basetable_2_n3 WHERE b = 'bonnie';{code} # Select again to see what happened: {code:java} SELECT ROW__IS__DELETED, ROW__ID, * FROM cmv_basetable_n6('acid.fetch.deleted.rows'='true'); {code} !截图4.PNG! # {code:java} SELECT ROW__IS__DELETED, ROW__ID, * FROM cmv_basetable_2_n3('acid.fetch.deleted.rows'='true'); {code} !截图5.PNG! # Use {{EXPLAIN CBO}} to produce the incremental rebuild plan for the MV, which is incorrect already: {code:java} EXPLAIN CBO ALTER MATERIALIZED VIEW cmv_mat_view_n6 REBUILD; {code} !截图6.PNG! # Rebuild MV and see (incorrect) results: {code:java} ALTER MATERIALIZED VIEW cmv_mat_view_n6 REBUILD; SELECT ROW__IS__DELETED, ROW__ID, * FROM cmv_mat_view_n6('acid.fetch.deleted.rows'='true');{code} !截图7.PNG! # Run MV definition directly, which outputs incorrect results because the MV is enabled for MV-based query rewrite, i.e., the following query will output what's in the MV for the time being: {code:java} SELECT cmv_basetable_n6.a, cmv_basetable_2_n3.c FROM cmv_basetable_n6 JOIN cmv_basetable_2_n3 ON (cmv_basetable_n6.a = cmv_basetable_2_n3.a) WHERE cmv_basetable_2_n3.c > 10.0; {code} !截图8.PNG! # Disable MV-based query rewrite for the MV and re-run the definition, which should give the correct results: {code:java} ALTER MATERIALIZED VIEW cmv_mat_view_n6 DISABLE REWRITE; SELECT cmv_basetable_n6.a, cmv_basetable_2_n3.c FROM cmv_basetable_n6 JOIN cmv_basetable_2_n3 ON (cmv_basetable_n6.a = cmv_basetable_2_n3.a) WHERE cmv_basetable_2_n3.c > 10.0;{code} !截图9.PNG! h1. Note This issue is also seen in update-incurred inserts/deletes. was: h1. Summary The incremental rebuild plan and execution output are incorrect when one side of the table join has inserted/deleted join keys that the other side has deleted/inserted (note the order). The argument is that tuples that have never been present simultaneously should not interact with one another, i.e., one's inserts should not join the other's deletes. h1. Related Test Case The bug was discovered during replication of the test case: ??hive/ql/src/test/queries/clientpositive/materialized_view_create_rewrite_5.q?? h1. Steps to Reproduce the Issue # Configurations: {code:java} SET hive.vectorized.execution.enabled=false; set hive.support.concurrency=true; set hive.txn.manager=org.apache.hadoop.hive.ql.lockmgr.DbTxnManager; set hive.strict.checks.cartesian.product=false; set hive.materializedview.rewriting=true;{code} # {code:java} create table cmv_basetable_n6 (a int, b varchar(256), c decimal(10,2), d int) stored as orc TBLPROPERTIES ('transactional'='true'); {code} # {code:java} insert into cmv_basetable_n6 values (1, 'alfred', 10.30, 2), (1, 'charlie', 20.30, 2); {code} # {code:java} create table cmv_basetable_2_n3 (a int, b varchar(256), c decimal(10,2), d int) stored as orc TBLPROPERTIES ('transactional'='true'); {code} # {code:java} insert into cmv_basetable_2_n3 values (1, 'bob', 30.30, 2), (1, 'bonnie', 40.30, 2);{code} # {code:java} CREATE MATERIALIZED VIEW cmv_mat_view_n6 TBLPROPERTIES ('transactional'='true') AS SELECT cmv_basetable_n6.a, cmv_basetable_2_n3.c FROM cmv_basetable_n6 JOIN cmv_basetable_2_n3 ON (cmv_basetable_n6.a = cmv_basetable_2_n3.a) WHERE cmv_basetable_2_n3.c > 10.0;{code} # {code:java} show tables; {code} !截图.PNG! # Select tuples, including deletion and with VirtualColumn's, from the MV and source tables. We see that the MV is correctly built upon creation: {code:java} SELECT ROW__IS__DELETED, ROW__ID, * FROM cmv_mat_view_n6('acid.fetch.deleted.rows'='true');{code} !截图1.PNG! # {code:java} SELECT ROW__IS__DELETED, ROW__ID, * FROM cmv_basetable_n6('acid.fetch.deleted.rows'='true'); {code} !截图2.PNG! # {code:java} SELECT ROW__IS__DELETED, ROW__ID, * FROM cmv_basetable_2_n3('acid.fetch.deleted.rows'='true'); {code} !截图3.PNG! # Now make an insert to the LHS and a delete to the RHS source table: {code:java} insert into cmv_basetable_n6 values (1, 'kevin', 50.30, 2); DELETE FROM cmv_basetable_2_n3 WHERE b = 'bonnie';{code} # Select again to see what happened: {code:java} SELECT ROW__IS__DELETED, ROW__ID, * FROM cmv_basetable_n6('acid.fetch.deleted.rows'='true'); {code} !截图4.PNG! # {code:java} SELECT ROW__IS__DELETED, ROW__ID, * FROM cmv_basetable_2_n3('acid.fetch.deleted.rows'='true'); {code} !截图5.PNG! # Use {{EXPLAIN CBO}} to produce the incremental rebuild plan for the MV, which is incorrect already: {code:java} EXPLAIN CBO ALTER MATERIALIZED VIEW cmv_mat_view_n6 REBUILD; {code} !截图6.PNG! # Rebuild MV and see (incorrect) results: {code:java} ALTER MATERIALIZED VIEW cmv_mat_view_n6 REBUILD; SELECT ROW__IS__DELETED, ROW__ID, * FROM cmv_mat_view_n6('acid.fetch.deleted.rows'='true');{code} !截图7.PNG! # Run MV definition directly, which outputs incorrect results because the MV is enabled for MV-based query rewrite, i.e., the following query will output what's in the MV for the time being: {code:java} SELECT cmv_basetable_n6.a, cmv_basetable_2_n3.c FROM cmv_basetable_n6 JOIN cmv_basetable_2_n3 ON (cmv_basetable_n6.a = cmv_basetable_2_n3.a) WHERE cmv_basetable_2_n3.c > 10.0; {code} !截图8.PNG! # Disable MV-based query rewrite for the MV and re-run the definition, which should give the correct results: {code:java} ALTER MATERIALIZED VIEW cmv_mat_view_n6 DISABLE REWRITE; SELECT cmv_basetable_n6.a, cmv_basetable_2_n3.c FROM cmv_basetable_n6 JOIN cmv_basetable_2_n3 ON (cmv_basetable_n6.a = cmv_basetable_2_n3.a) WHERE cmv_basetable_2_n3.c > 10.0;{code} !截图9.PNG! h1. Note This issue is also seen in update-incurred inserts/deletes. > Incremental rebuild goes wrong when inserts and deletes overlap between the > source tables > ----------------------------------------------------------------------------------------- > > Key: HIVE-27924 > URL: https://issues.apache.org/jira/browse/HIVE-27924 > Project: Hive > Issue Type: Bug > Components: Materialized views > Affects Versions: 4.0.0-beta-1 > Environment: * Docker version : 19.03.6 > * Hive version : 4.0.0-beta-1 > * Driver version : Hive JDBC (4.0.0-beta-1) > * Beeline version : 4.0.0-beta-1 > Reporter: Wenhao Li > Priority: Critical > Labels: bug, hive, materializedviews > Attachments: 截图.PNG, 截图1.PNG, 截图2.PNG, 截图3.PNG, 截图4.PNG, 截图5.PNG, > 截图6.PNG, 截图7.PNG, 截图8.PNG, 截图9.PNG > > > h1. Summary > The incremental rebuild plan and execution output are incorrect when one side > of the table join has inserted/deleted join keys that the other side has > deleted/inserted (note the order). > The argument is that tuples that have never been present simultaneously > should not interact with one another, i.e., one's inserts should not join the > other's deletes. > h1. Related Test Case > The bug was discovered during replication of the test case: > ??hive/ql/src/test/queries/clientpositive/materialized_view_create_rewrite_5.q?? > h1. Steps to Reproduce the Issue > # Configurations: > {code:java} > SET hive.vectorized.execution.enabled=false; > set hive.support.concurrency=true; > set hive.txn.manager=org.apache.hadoop.hive.ql.lockmgr.DbTxnManager; > set hive.strict.checks.cartesian.product=false; > set hive.materializedview.rewriting=true;{code} > # > {code:java} > create table cmv_basetable_n6 (a int, b varchar(256), c decimal(10,2), d int) > stored as orc TBLPROPERTIES ('transactional'='true'); {code} > # > {code:java} > insert into cmv_basetable_n6 values > (1, 'alfred', 10.30, 2), > (1, 'charlie', 20.30, 2); {code} > # > {code:java} > create table cmv_basetable_2_n3 (a int, b varchar(256), c decimal(10,2), d > int) stored as orc TBLPROPERTIES ('transactional'='true'); {code} > # > {code:java} > insert into cmv_basetable_2_n3 values > (1, 'bob', 30.30, 2), > (1, 'bonnie', 40.30, 2);{code} > # > {code:java} > CREATE MATERIALIZED VIEW cmv_mat_view_n6 TBLPROPERTIES > ('transactional'='true') AS > SELECT cmv_basetable_n6.a, cmv_basetable_2_n3.c > FROM cmv_basetable_n6 JOIN cmv_basetable_2_n3 ON (cmv_basetable_n6.a = > cmv_basetable_2_n3.a) > WHERE cmv_basetable_2_n3.c > 10.0;{code} > # > {code:java} > show tables; {code} > !截图.PNG! > # Select tuples, including deletion and with VirtualColumn's, from the MV > and source tables. We see that the MV is correctly built upon creation: > {code:java} > SELECT ROW__IS__DELETED, ROW__ID, * FROM > cmv_mat_view_n6('acid.fetch.deleted.rows'='true');{code} > !截图1.PNG! > # > {code:java} > SELECT ROW__IS__DELETED, ROW__ID, * FROM > cmv_basetable_n6('acid.fetch.deleted.rows'='true'); {code} > !截图2.PNG! > # > {code:java} > SELECT ROW__IS__DELETED, ROW__ID, * FROM > cmv_basetable_2_n3('acid.fetch.deleted.rows'='true'); {code} > !截图3.PNG! > # Now make an insert to the LHS and a delete to the RHS source table: > {code:java} > insert into cmv_basetable_n6 values > (1, 'kevin', 50.30, 2); > DELETE FROM cmv_basetable_2_n3 WHERE b = 'bonnie';{code} > # Select again to see what happened: > {code:java} > SELECT ROW__IS__DELETED, ROW__ID, * FROM > cmv_basetable_n6('acid.fetch.deleted.rows'='true'); {code} > !截图4.PNG! > # > {code:java} > SELECT ROW__IS__DELETED, ROW__ID, * FROM > cmv_basetable_2_n3('acid.fetch.deleted.rows'='true'); {code} > !截图5.PNG! > # Use {{EXPLAIN CBO}} to produce the incremental rebuild plan for the MV, > which is incorrect already: > {code:java} > EXPLAIN CBO > ALTER MATERIALIZED VIEW cmv_mat_view_n6 REBUILD; {code} > !截图6.PNG! > # Rebuild MV and see (incorrect) results: > {code:java} > ALTER MATERIALIZED VIEW cmv_mat_view_n6 REBUILD; > SELECT ROW__IS__DELETED, ROW__ID, * FROM > cmv_mat_view_n6('acid.fetch.deleted.rows'='true');{code} > !截图7.PNG! > # Run MV definition directly, which outputs incorrect results because the MV > is enabled for MV-based query rewrite, i.e., the following query will output > what's in the MV for the time being: > {code:java} > SELECT cmv_basetable_n6.a, cmv_basetable_2_n3.c > FROM cmv_basetable_n6 JOIN cmv_basetable_2_n3 ON (cmv_basetable_n6.a = > cmv_basetable_2_n3.a) > WHERE cmv_basetable_2_n3.c > 10.0; {code} > !截图8.PNG! > # Disable MV-based query rewrite for the MV and re-run the definition, which > should give the correct results: > {code:java} > ALTER MATERIALIZED VIEW cmv_mat_view_n6 DISABLE REWRITE; > SELECT cmv_basetable_n6.a, cmv_basetable_2_n3.c > FROM cmv_basetable_n6 JOIN cmv_basetable_2_n3 ON (cmv_basetable_n6.a = > cmv_basetable_2_n3.a) > WHERE cmv_basetable_2_n3.c > 10.0;{code} > !截图9.PNG! > h1. Note > This issue is also seen in update-incurred inserts/deletes. -- This message was sent by Atlassian Jira (v8.20.10#820010)