[ https://issues.apache.org/jira/browse/HIVE-26368?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Stamatis Zampetakis updated HIVE-26368: --------------------------------------- Component/s: CBO > DISTINCT keyword from Count UDF is removed from query plan when CBO is enabled > ------------------------------------------------------------------------------ > > Key: HIVE-26368 > URL: https://issues.apache.org/jira/browse/HIVE-26368 > Project: Hive > Issue Type: Bug > Components: CBO > Reporter: Taraka Rama Rao Lethavadla > Priority: Major > > {*}Reproduction steps{*}: > _cat ql/src/test/queries/clientpositive/ctas_distinct.q_ > {noformat} > create database test_db; > create table test_db.test_tb > (col1 string, > col2 int, > col3 int, > col4 date); > insert into test_db.test_tb values('a',1,2,'2022-01-01'); > insert into test_db.test_tb values('a',11,2,'2022-01-02'); > insert into test_db.test_tb values('a',1,2,'2022-01-01'); > insert into test_db.test_tb values('a',1,22,'2022-01-02'); > insert into test_db.test_tb values('a',11,2,'2022-01-01'); > set hive.log.explain.output=true; > create table test as > SELECT col1 > , col2 > , COUNT(DISTINCT col3, col4) AS susp_visit_count > FROM test_db.test_tb > GROUP BY col1 > , col2;{noformat} > When we run the above test case, > {noformat} > mvn install -Pitests -pl itests/qtest -Dtest=TestMiniLlapLocalCliDriver > -Dqfile=ctas_distinct.q -Dtest.output.overwrite{noformat} > the below exception is thrown > {noformat} > 2022-07-04T09:22:02,949 ERROR [76039186-5579-4a9b-b787-6d92083f1bb9 main] > parse.CalcitePlanner: CBO failed, skipping CBO. 
> org.apache.hadoop.hive.ql.exec.UDFArgumentException: DISTINCT keyword must be > specified > at > org.apache.hadoop.hive.ql.udf.generic.GenericUDAFCount.getEvaluator(GenericUDAFCount.java:73) > ~[hive-exec-4.0.0-alpha-2-SNAPSHOT.jar:4.0.0-alpha-2-SNAPSHOT] > at > org.apache.hadoop.hive.ql.exec.FunctionRegistry.getGenericUDAFEvaluator(FunctionRegistry.java:1255) > ~[hive-exec-4.0.0-alpha-2-SNAPSHOT.jar:4.0.0-alpha-2-SNAPSHOT] > at > org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.getGenericUDAFEvaluator2(SemanticAnalyzer.java:4974) > ~[hive-exec-4.0.0-alpha-2-SNAPSHOT.jar:4.0.0-alpha-2-SNAPSHOT] > at > org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.getGenericUDAFEvaluator(SemanticAnalyzer.java:4966) > ~[hive-exec-4.0.0-alpha-2-SNAPSHOT.jar:4.0.0-alpha-2-SNAPSHOT] > at > org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.genGroupByPlanMapGroupByOperator(SemanticAnalyzer.java:5651) > ~[hive-exec-4.0.0-alpha-2-SNAPSHOT.jar:4.0.0-alpha-2-SNAPSHOT] > at > org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.genGroupByPlanMapAggrNoSkew(SemanticAnalyzer.java:6578) > ~[hive-exec-4.0.0-alpha-2-SNAPSHOT.jar:4.0.0-alpha-2-SNAPSHOT] > at > org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.genBodyPlan(SemanticAnalyzer.java:11077) > ~[hive-exec-4.0.0-alpha-2-SNAPSHOT.jar:4.0.0-alpha-2-SNAPSHOT] > at > org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.genPlan(SemanticAnalyzer.java:12009) > ~[hive-exec-4.0.0-alpha-2-SNAPSHOT.jar:4.0.0-alpha-2-SNAPSHOT] > at > org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.genPlan(SemanticAnalyzer.java:11875) > ~[hive-exec-4.0.0-alpha-2-SNAPSHOT.jar:4.0.0-alpha-2-SNAPSHOT] > at > org.apache.hadoop.hive.ql.parse.CalcitePlanner.genOPTree(CalcitePlanner.java:631) > [hive-exec-4.0.0-alpha-2-SNAPSHOT.jar:4.0.0-alpha-2-SNAPSHOT] > at > org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.analyzeInternal(SemanticAnalyzer.java:12714) > [hive-exec-4.0.0-alpha-2-SNAPSHOT.jar:4.0.0-alpha-2-SNAPSHOT] > at > 
org.apache.hadoop.hive.ql.parse.CalcitePlanner.analyzeInternal(CalcitePlanner.java:460) > [hive-exec-4.0.0-alpha-2-SNAPSHOT.jar:4.0.0-alpha-2-SNAPSHOT] > at > org.apache.hadoop.hive.ql.parse.BaseSemanticAnalyzer.analyze(BaseSemanticAnalyzer.java:317) > [hive-exec-4.0.0-alpha-2-SNAPSHOT.jar:4.0.0-alpha-2-SNAPSHOT] > at org.apache.hadoop.hive.ql.Compiler.analyze(Compiler.java:224) > [hive-exec-4.0.0-alpha-2-SNAPSHOT.jar:4.0.0-alpha-2-SNAPSHOT] > at org.apache.hadoop.hive.ql.Compiler.compile(Compiler.java:106) > [hive-exec-4.0.0-alpha-2-SNAPSHOT.jar:4.0.0-alpha-2-SNAPSHOT] > at org.apache.hadoop.hive.ql.Driver.compile(Driver.java:507) > [hive-exec-4.0.0-alpha-2-SNAPSHOT.jar:4.0.0-alpha-2-SNAPSHOT] > at org.apache.hadoop.hive.ql.Driver.compileInternal(Driver.java:459) > [hive-exec-4.0.0-alpha-2-SNAPSHOT.jar:4.0.0-alpha-2-SNAPSHOT] > at > org.apache.hadoop.hive.ql.Driver.compileAndRespond(Driver.java:424) > [hive-exec-4.0.0-alpha-2-SNAPSHOT.jar:4.0.0-alpha-2-SNAPSHOT] > at > org.apache.hadoop.hive.ql.Driver.compileAndRespond(Driver.java:418) > [hive-exec-4.0.0-alpha-2-SNAPSHOT.jar:4.0.0-alpha-2-SNAPSHOT] > at > org.apache.hadoop.hive.ql.reexec.ReExecDriver.compileAndRespond(ReExecDriver.java:121) > [hive-exec-4.0.0-alpha-2-SNAPSHOT.jar:4.0.0-alpha-2-SNAPSHOT] > at > org.apache.hadoop.hive.ql.reexec.ReExecDriver.run(ReExecDriver.java:227) > [hive-exec-4.0.0-alpha-2-SNAPSHOT.jar:4.0.0-alpha-2-SNAPSHOT] > at > org.apache.hadoop.hive.cli.CliDriver.processLocalCmd(CliDriver.java:255) > [hive-cli-4.0.0-alpha-2-SNAPSHOT.jar:?] > at > org.apache.hadoop.hive.cli.CliDriver.processCmd1(CliDriver.java:200) > [hive-cli-4.0.0-alpha-2-SNAPSHOT.jar:?] > at > org.apache.hadoop.hive.cli.CliDriver.processCmd(CliDriver.java:126) > [hive-cli-4.0.0-alpha-2-SNAPSHOT.jar:?] > at > org.apache.hadoop.hive.cli.CliDriver.processLine(CliDriver.java:421) > [hive-cli-4.0.0-alpha-2-SNAPSHOT.jar:?] 
> at > org.apache.hadoop.hive.cli.CliDriver.processLine(CliDriver.java:352) > [hive-cli-4.0.0-alpha-2-SNAPSHOT.jar:?] > at > org.apache.hadoop.hive.ql.QTestUtil.executeClientInternal(QTestUtil.java:727) > [hive-it-util-4.0.0-alpha-2-SNAPSHOT.jar:4.0.0-alpha-2-SNAPSHOT] > at > org.apache.hadoop.hive.ql.QTestUtil.executeClient(QTestUtil.java:697) > [hive-it-util-4.0.0-alpha-2-SNAPSHOT.jar:4.0.0-alpha-2-SNAPSHOT]{noformat} > *Snippets from Log* > > {noformat} > 2022-07-04T09:22:02,848 DEBUG [76039186-5579-4a9b-b787-6d92083f1bb9 main] > parse.CalcitePlanner: Initial CBO Plan: > HiveProject(col1=[$0], col2=[$1], susp_visit_count=[$2]) > HiveAggregate(group=[{0, 1}], agg#0=[count(DISTINCT $2, $3)]) > HiveProject($f0=[$0], $f1=[$1], $f2=[$2], $f3=[$3]) > HiveTableScan(table=[[test_db, test_tb]], table:alias=[test_tb]) > ... > ... > 2022-07-04T09:22:02,900 DEBUG [76039186-5579-4a9b-b787-6d92083f1bb9 main] > calcite.RelOptHiveTable: Stats for column col1 in table test_tb stored in > cache > 2022-07-04T09:22:02,900 DEBUG [76039186-5579-4a9b-b787-6d92083f1bb9 main] > calcite.RelOptHiveTable: colName: col1 colType: string countDistincts: 1 > numNulls: 0 avgColLen: 1.0 numTrues: 0 numFalses: 0 isPrimaryKey: false > isEstimated: false > 2022-07-04T09:22:02,900 DEBUG [76039186-5579-4a9b-b787-6d92083f1bb9 main] > calcite.RelOptHiveTable: Stats for column col2 in table test_tb stored in > cache > 2022-07-04T09:22:02,900 DEBUG [76039186-5579-4a9b-b787-6d92083f1bb9 main] > calcite.RelOptHiveTable: colName: col2 colType: int countDistincts: 2 > numNulls: 0 avgColLen: 4.0 numTrues: 0 numFalses: 0 Range: [ min: 1 max: 11 ] > isPrimaryKey: false isEstimated: false > 2022-07-04T09:22:02,900 DEBUG [76039186-5579-4a9b-b787-6d92083f1bb9 main] > calcite.RelOptHiveTable: Stats for column col3 in table test_tb stored in > cache > 2022-07-04T09:22:02,900 DEBUG [76039186-5579-4a9b-b787-6d92083f1bb9 main] > calcite.RelOptHiveTable: colName: col3 colType: int countDistincts: 2 > numNulls: 0 avgColLen: 4.0 numTrues: 0 
numFalses: 0 Range: [ min: 2 max: 22 ] > isPrimaryKey: false isEstimated: false > 2022-07-04T09:22:02,900 DEBUG [76039186-5579-4a9b-b787-6d92083f1bb9 main] > calcite.RelOptHiveTable: Stats for column col4 in table test_tb stored in > cache > 2022-07-04T09:22:02,900 DEBUG [76039186-5579-4a9b-b787-6d92083f1bb9 main] > calcite.RelOptHiveTable: colName: col4 colType: date countDistincts: 0 > numNulls: 0 avgColLen: 56.0 numTrues: 0 numFalses: 0 Range: [ min: 18993 max: > 18994 ] isPrimaryKey: false isEstimated: false > 2022-07-04T09:22:02,900 DEBUG [76039186-5579-4a9b-b787-6d92083f1bb9 main] > rules.HiveRelFieldTrimmer: Got col stats for [0, 1, 2, 3] in [test_db, > test_tb] > 2022-07-04T09:22:02,924 DEBUG [76039186-5579-4a9b-b787-6d92083f1bb9 main] > rules.RelFieldTrimmer: Plan after trimming unused fields > HiveAggregate(group=[{0, 1}], agg#0=[count($2, $3)]) > HiveAggregate(group=[{0, 1, 2, 3}]) > HiveProject($f0=[$0], $f1=[$1], $f2=[$2], $f3=[$3]) > HiveTableScan(table=[[test_db, test_tb]], table:alias=[test_tb]) > {noformat} > So I believe DISTINCT is being dropped from the plan when CBO is enabled; when we turn off CBO, > the query works as expected. > -- This message was sent by Atlassian Jira (v8.20.10#820010)