[ https://issues.apache.org/jira/browse/HIVE-7982?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Mostafa Mokhtar updated HIVE-7982: ---------------------------------- Description: Now explain for Q17 is back in the 12 second range, I checked the queries issues to MySQL and they are very different than before on August 15 explain was completing in under 5 seconds and we issued the following queries : {code} select "COLUMN_NAME", "COLUMN_TYPE", min("LONG_LOW_VALUE"), max("LONG_HIGH_VALUE"), min("DOUBLE_LOW_VALUE"), max("DOUBLE_HIGH_VALUE"), min("BIG_DECIMAL_LOW_VALUE"), max("BIG_DECIMAL_HIGH_VALUE"), sum("NUM_NULLS"), max("NUM_DISTINCTS"), max("AVG_COL_LEN"), max("MAX_COL_LEN"), sum("NUM_TRUES"), sum("NUM_FALSES") from "PART_COL_STATS" where "DB_NAME" = 'tpcds_bin_partitioned_orc_30000' and "TABLE_NAME" = 'store_returns' and "COLUMN_NAME" in ('sr_item_sk','sr_customer_sk','sr_ticket_number') AND "PARTITION_NAME" in ('sr_returned_date=1998-01-06','sr_returned_date=1998-01-07',..'sr_returned_date=2003-07-01') group by "COLUMN_NAME", "COLUMN_TYPE"; select "COLUMN_NAME", "COLUMN_TYPE", min("LONG_LOW_VALUE"), max("LONG_HIGH_VALUE"), min("DOUBLE_LOW_VALUE"), max("DOUBLE_HIGH_VALUE"), min("BIG_DECIMAL_LOW_VALUE"), max("BIG_DECIMAL_HIGH_VALUE"), sum("NUM_NULLS"), max("NUM_DISTINCTS"), max("AVG_COL_LEN"), max("MAX_COL_LEN"), sum("NUM_TRUES"), sum("NUM_FALSES") from "PART_COL_STATS" where "DB_NAME" = 'tpcds_bin_partitioned_orc_30000' and "TABLE_NAME" = 'store_returns' and "COLUMN_NAME" in ('sr_returned_date_sk') AND "PARTITION_NAME" in ('sr_returned_date=1998-01-06'..'sr_returned_date=2003-07-01') group by "COLUMN_NAME", "COLUMN_TYPE" {code} Currently explain Q17 takes 11 seconds and the queries sent to MySQL are very inefficient because 1) They no longer do the aggregation on MySQL and get a row per partition 2) There is a query per stats K,V pair so the number of queries is up by 9x {code} select COLUMN_NAME, COLUMN_TYPE, count(PARTITION_NAME) from PART_COL_STATS where DB_NAME = 'tpcds_bin_partitioned_orc_200' and TABLE_NAME = 'store_returns' and COLUMN_NAME in ('sr_item_sk','sr_customer_sk','sr_ticket_number') and PARTITION_NAME in ('sr_returned_date=1998-01-06','sr_returned_date=2003-07-01') group by COLUMN_NAME, COLUMN_TYPE select COLUMN_NAME, sum(NUM_NULLS), sum(NUM_TRUES), sum(NUM_FALSES) from PART_COL_STATS where DB_NAME = 'tpcds_bin_partitioned_orc_200' and TABLE_NAME = 'store_returns' and COLUMN_NAME in ('sr_customer_sk','sr_item_sk','sr_ticket_number') and PARTITION_NAME in ('sr_returned_date=1998-01-06','sr_returned_date=2003-07-01') group by COLUMN_NAME select LONG_LOW_VALUE,PARTITION_NAME from PART_COL_STATS where DB_NAME = 'tpcds_bin_partitioned_orc_200' and TABLE_NAME = 'store_returns' and COLUMN_NAME in ('sr_customer_sk') and PARTITION_NAME in ('sr_returned_date=1998-01-06','sr_returned_date=2003-07-01') order by 'LONG_LOW_VALUE' select LONG_HIGH_VALUE,PARTITION_NAME from PART_COL_STATS where DB_NAME = 'tpcds_bin_partitioned_orc_200' and TABLE_NAME = 'store_returns' and COLUMN_NAME in ('sr_customer_sk') and PARTITION_NAME in ('sr_returned_date=1998-01-06','sr_returned_date=2003-07-01') order by 'LONG_HIGH_VALUE' select DOUBLE_LOW_VALUE,PARTITION_NAME from PART_COL_STATS where DB_NAME = 'tpcds_bin_partitioned_orc_200' and TABLE_NAME = 'store_returns' and COLUMN_NAME in ('sr_customer_sk') and PARTITION_NAME in ('sr_returned_date=1998-01-06','sr_returned_date=2003-07-01') order by 'DOUBLE_LOW_VALUE' select DOUBLE_HIGH_VALUE,PARTITION_NAME from PART_COL_STATS where DB_NAME = 'tpcds_bin_partitioned_orc_200' and TABLE_NAME = 'store_returns' and COLUMN_NAME in ('sr_customer_sk') and PARTITION_NAME in ('sr_returned_date=1998-01-06','sr_returned_date=2003-07-01') order by 'DOUBLE_HIGH_VALUE' select BIG_DECIMAL_LOW_VALUE,PARTITION_NAME from PART_COL_STATS where DB_NAME = 'tpcds_bin_partitioned_orc_200' and TABLE_NAME = 'store_returns' and COLUMN_NAME in ('sr_customer_sk') and PARTITION_NAME in ('sr_returned_date=1998-01-06','sr_returned_date=2003-07-01') order by 'BIG_DECIMAL_LOW_VALUE' select BIG_DECIMAL_HIGH_VALUE,PARTITION_NAME from PART_COL_STATS where DB_NAME = 'tpcds_bin_partitioned_orc_200' and TABLE_NAME = 'store_returns' and COLUMN_NAME in ('sr_customer_sk') and PARTITION_NAME in ('sr_returned_date=1998-01-06','sr_returned_date=2003-07-01') order by 'BIG_DECIMAL_HIGH_VALUE' select NUM_DISTINCTS,PARTITION_NAME from PART_COL_STATS where DB_NAME = 'tpcds_bin_partitioned_orc_200' and TABLE_NAME = 'store_returns' and COLUMN_NAME in ('sr_customer_sk') and PARTITION_NAME in ('sr_returned_date=1998-01-06','sr_returned_date=2003-07-01') order by 'NUM_DISTINCTS' select AVG_COL_LEN,PARTITION_NAME from PART_COL_STATS where DB_NAME = 'tpcds_bin_partitioned_orc_200' and TABLE_NAME = 'store_returns' and COLUMN_NAME in ('sr_customer_sk') and PARTITION_NAME in ('sr_returned_date=1998-01-06','sr_returned_date=2003-07-01') order by 'AVG_COL_LEN' select MAX_COL_LEN,PARTITION_NAME from PART_COL_STATS where DB_NAME = 'tpcds_bin_partitioned_orc_200' and TABLE_NAME = 'store_returns' and COLUMN_NAME in ('sr_customer_sk') and PARTITION_NAME in ('sr_returned_date=1998-01-06','sr_returned_date=2003-07-01') order by 'MAX_COL_LEN' select LONG_LOW_VALUE,PARTITION_NAME from PART_COL_STATS where DB_NAME = 'tpcds_bin_partitioned_orc_200' and TABLE_NAME = 'store_returns' and COLUMN_NAME in ('sr_item_sk') and PARTITION_NAME in ('sr_returned_date=1998-01-06','sr_returned_date=2003-07-01') order by 'LONG_LOW_VALUE' select LONG_HIGH_VALUE,PARTITION_NAME from PART_COL_STATS where DB_NAME = 'tpcds_bin_partitioned_orc_200' and TABLE_NAME = 'store_returns' and COLUMN_NAME in ('sr_item_sk') and PARTITION_NAME in ('sr_returned_date=1998-01-06','sr_returned_date=2003-07-01') order by 'LONG_HIGH_VALUE' {code} was: With CBO we need the correct set of indexes to provide an efficient Read/Write access. These indexes improve performance of Explain plan and Analyzed table by 60% and 300%. {code} MySQL CREATE INDEX PART_COL_STATS_N50 ON PART_COL_STATS (DB_NAME,TABLE_NAME,COLUMN_NAME) USING BTREE; MsSQL CREATE INDEX PART_COL_STATS_N50 ON PART_COL_STATS (DB_NAME,TABLE_NAME,COLUMN_NAME); Oracle CREATE INDEX PART_COL_STATS_N50 ON PART_COL_STATS (DB_NAME,TABLE_NAME,COLUMN_NAME); Postgres CREATE INDEX "PART_COL_STATS_N50" ON "PART_COL_STATS" USING btree ("DB_NAME","TABLE_NAME","COLUMN_NAME"); {code} > Regression in explain with CBO enabled due to issuing query per K,V for the > stats > --------------------------------------------------------------------------------- > > Key: HIVE-7982 > URL: https://issues.apache.org/jira/browse/HIVE-7982 > Project: Hive > Issue Type: Bug > Components: CBO > Affects Versions: 0.14.0 > Reporter: Mostafa Mokhtar > Assignee: Ashutosh Chauhan > Fix For: 0.14.0 > > > Now explain for Q17 is back in the 12 second range, I checked the queries > issues to MySQL and they are very different than before > on August 15 explain was completing in under 5 seconds and we issued the > following queries : > {code} > select "COLUMN_NAME", "COLUMN_TYPE", min("LONG_LOW_VALUE"), > max("LONG_HIGH_VALUE"), min("DOUBLE_LOW_VALUE"), max("DOUBLE_HIGH_VALUE"), > min("BIG_DECIMAL_LOW_VALUE"), max("BIG_DECIMAL_HIGH_VALUE"), > sum("NUM_NULLS"), max("NUM_DISTINCTS"), max("AVG_COL_LEN"), > max("MAX_COL_LEN"), sum("NUM_TRUES"), sum("NUM_FALSES") from "PART_COL_STATS" > where "DB_NAME" = 'tpcds_bin_partitioned_orc_30000' and "TABLE_NAME" = > 'store_returns' and "COLUMN_NAME" in > ('sr_item_sk','sr_customer_sk','sr_ticket_number') AND "PARTITION_NAME" in > ('sr_returned_date=1998-01-06','sr_returned_date=1998-01-07',..'sr_returned_date=2003-07-01') > group by "COLUMN_NAME", "COLUMN_TYPE"; > select "COLUMN_NAME", "COLUMN_TYPE", min("LONG_LOW_VALUE"), > max("LONG_HIGH_VALUE"), min("DOUBLE_LOW_VALUE"), max("DOUBLE_HIGH_VALUE"), > min("BIG_DECIMAL_LOW_VALUE"), max("BIG_DECIMAL_HIGH_VALUE"), > sum("NUM_NULLS"), max("NUM_DISTINCTS"), max("AVG_COL_LEN"), > max("MAX_COL_LEN"), sum("NUM_TRUES"), sum("NUM_FALSES") from "PART_COL_STATS" > where "DB_NAME" = 'tpcds_bin_partitioned_orc_30000' and "TABLE_NAME" = > 'store_returns' and "COLUMN_NAME" in ('sr_returned_date_sk') AND > "PARTITION_NAME" in > ('sr_returned_date=1998-01-06'..'sr_returned_date=2003-07-01') group by > "COLUMN_NAME", "COLUMN_TYPE" > {code} > Currently explain Q17 takes 11 seconds and the queries sent to MySQL are very > inefficient because > 1) They no longer do the aggregation on MySQL and get a row per partition > 2) There is a query per stats K,V pair so the number of queries is up by 9x > {code} > select COLUMN_NAME, COLUMN_TYPE, count(PARTITION_NAME) from > PART_COL_STATS where DB_NAME = 'tpcds_bin_partitioned_orc_200' and TABLE_NAME > = 'store_returns' and COLUMN_NAME in > ('sr_item_sk','sr_customer_sk','sr_ticket_number') and PARTITION_NAME in > ('sr_returned_date=1998-01-06','sr_returned_date=2003-07-01') group by > COLUMN_NAME, COLUMN_TYPE > select COLUMN_NAME, sum(NUM_NULLS), sum(NUM_TRUES), > sum(NUM_FALSES) from PART_COL_STATS where DB_NAME = > 'tpcds_bin_partitioned_orc_200' and TABLE_NAME = 'store_returns' and > COLUMN_NAME in ('sr_customer_sk','sr_item_sk','sr_ticket_number') and > PARTITION_NAME in > ('sr_returned_date=1998-01-06','sr_returned_date=2003-07-01') group by > COLUMN_NAME > select LONG_LOW_VALUE,PARTITION_NAME from PART_COL_STATS where > DB_NAME = 'tpcds_bin_partitioned_orc_200' and TABLE_NAME = 'store_returns' > and COLUMN_NAME in ('sr_customer_sk') and PARTITION_NAME in > ('sr_returned_date=1998-01-06','sr_returned_date=2003-07-01') order by > 'LONG_LOW_VALUE' > select LONG_HIGH_VALUE,PARTITION_NAME from PART_COL_STATS where > DB_NAME = 'tpcds_bin_partitioned_orc_200' and TABLE_NAME = 'store_returns' > and COLUMN_NAME in ('sr_customer_sk') and PARTITION_NAME in > ('sr_returned_date=1998-01-06','sr_returned_date=2003-07-01') order by > 'LONG_HIGH_VALUE' > select DOUBLE_LOW_VALUE,PARTITION_NAME from PART_COL_STATS > where DB_NAME = 'tpcds_bin_partitioned_orc_200' and TABLE_NAME = > 'store_returns' and COLUMN_NAME in ('sr_customer_sk') and PARTITION_NAME in > ('sr_returned_date=1998-01-06','sr_returned_date=2003-07-01') order by > 'DOUBLE_LOW_VALUE' > select DOUBLE_HIGH_VALUE,PARTITION_NAME from PART_COL_STATS > where DB_NAME = 'tpcds_bin_partitioned_orc_200' and TABLE_NAME = > 'store_returns' and COLUMN_NAME in ('sr_customer_sk') and PARTITION_NAME in > ('sr_returned_date=1998-01-06','sr_returned_date=2003-07-01') order by > 'DOUBLE_HIGH_VALUE' > select BIG_DECIMAL_LOW_VALUE,PARTITION_NAME from PART_COL_STATS > where DB_NAME = 'tpcds_bin_partitioned_orc_200' and TABLE_NAME = > 'store_returns' and COLUMN_NAME in ('sr_customer_sk') and PARTITION_NAME in > ('sr_returned_date=1998-01-06','sr_returned_date=2003-07-01') order by > 'BIG_DECIMAL_LOW_VALUE' > select BIG_DECIMAL_HIGH_VALUE,PARTITION_NAME from > PART_COL_STATS where DB_NAME = 'tpcds_bin_partitioned_orc_200' and TABLE_NAME > = 'store_returns' and COLUMN_NAME in ('sr_customer_sk') and PARTITION_NAME in > ('sr_returned_date=1998-01-06','sr_returned_date=2003-07-01') order by > 'BIG_DECIMAL_HIGH_VALUE' > select NUM_DISTINCTS,PARTITION_NAME from PART_COL_STATS where > DB_NAME = 'tpcds_bin_partitioned_orc_200' and TABLE_NAME = 'store_returns' > and COLUMN_NAME in ('sr_customer_sk') and PARTITION_NAME in > ('sr_returned_date=1998-01-06','sr_returned_date=2003-07-01') order by > 'NUM_DISTINCTS' > select AVG_COL_LEN,PARTITION_NAME from PART_COL_STATS where > DB_NAME = 'tpcds_bin_partitioned_orc_200' and TABLE_NAME = 'store_returns' > and COLUMN_NAME in ('sr_customer_sk') and PARTITION_NAME in > ('sr_returned_date=1998-01-06','sr_returned_date=2003-07-01') order by > 'AVG_COL_LEN' > select MAX_COL_LEN,PARTITION_NAME from PART_COL_STATS where > DB_NAME = 'tpcds_bin_partitioned_orc_200' and TABLE_NAME = 'store_returns' > and COLUMN_NAME in ('sr_customer_sk') and PARTITION_NAME in > ('sr_returned_date=1998-01-06','sr_returned_date=2003-07-01') order by > 'MAX_COL_LEN' > select LONG_LOW_VALUE,PARTITION_NAME from PART_COL_STATS where > DB_NAME = 'tpcds_bin_partitioned_orc_200' and TABLE_NAME = 'store_returns' > and COLUMN_NAME in ('sr_item_sk') and PARTITION_NAME in > ('sr_returned_date=1998-01-06','sr_returned_date=2003-07-01') order by > 'LONG_LOW_VALUE' > select LONG_HIGH_VALUE,PARTITION_NAME from PART_COL_STATS where > DB_NAME = 'tpcds_bin_partitioned_orc_200' and TABLE_NAME = 'store_returns' > and COLUMN_NAME in ('sr_item_sk') and PARTITION_NAME in > ('sr_returned_date=1998-01-06','sr_returned_date=2003-07-01') order by > 'LONG_HIGH_VALUE' > {code} -- This message was sent by Atlassian JIRA (v6.3.4#6332)