Taraka Rama Rao Lethavadla created HIVE-28066:
-------------------------------------------------

             Summary: Incorrect results when retrieving values from parquet 
table with bigint/int arrays and order by clause
                 Key: HIVE-28066
                 URL: https://issues.apache.org/jira/browse/HIVE-28066
             Project: Hive
          Issue Type: Bug
            Reporter: Taraka Rama Rao Lethavadla


{noformat}
CREATE EXTERNAL TABLE `array_test`(
`c1` array<bigint>,
`c2` array<string>)
ROW FORMAT SERDE
'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
STORED AS INPUTFORMAT
'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat';{noformat}



{noformat}
insert into array_test select array(cast(NULL as bigint)), array(cast(NULL as 
string));
insert into array_test select collect_set(cast(NULL as bigint)), 
collect_set(cast(NULL as string));
insert into array_test select array(cast(0 as bigint)), array(cast('A' as 
string));
insert into array_test select array(cast(1 as bigint)), array(cast(1 as 
string));{noformat}


select * from array_test; produces correct result but not the below query


{noformat}
select c1,c2 from array_test order by c1,c2;{noformat}
Result looks like below
 
{noformat}
PREHOOK: query: CREATE EXTERNAL TABLE `array_test`(
`c1` array<bigint>,
`c2` array<string>)
ROW FORMAT SERDE
'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
STORED AS INPUTFORMAT
'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat'
PREHOOK: type: CREATETABLE
PREHOOK: Output: database:default
PREHOOK: Output: default@array_test
POSTHOOK: query: CREATE EXTERNAL TABLE `array_test`(
`c1` array<bigint>,
`c2` array<string>)
ROW FORMAT SERDE
'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
STORED AS INPUTFORMAT
'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat'
POSTHOOK: type: CREATETABLE
POSTHOOK: Output: database:default
POSTHOOK: Output: default@array_test
PREHOOK: query: insert into array_test select array(cast(NULL as bigint)), 
array(cast(NULL as string))
PREHOOK: type: QUERY
PREHOOK: Input: _dummy_database@_dummy_table
PREHOOK: Output: default@array_test
POSTHOOK: query: insert into array_test select array(cast(NULL as bigint)), 
array(cast(NULL as string))
POSTHOOK: type: QUERY
POSTHOOK: Input: _dummy_database@_dummy_table
POSTHOOK: Output: default@array_test
POSTHOOK: Lineage: array_test.c1 EXPRESSION []
POSTHOOK: Lineage: array_test.c2 EXPRESSION []
PREHOOK: query: insert into array_test select collect_set(cast(NULL as 
bigint)), collect_set(cast(NULL as string))
PREHOOK: type: QUERY
PREHOOK: Input: _dummy_database@_dummy_table
PREHOOK: Output: default@array_test
POSTHOOK: query: insert into array_test select collect_set(cast(NULL as 
bigint)), collect_set(cast(NULL as string))
POSTHOOK: type: QUERY
POSTHOOK: Input: _dummy_database@_dummy_table
POSTHOOK: Output: default@array_test
POSTHOOK: Lineage: array_test.c1 EXPRESSION []
POSTHOOK: Lineage: array_test.c2 EXPRESSION []
PREHOOK: query: insert into array_test select array(cast(0 as bigint)), 
array(cast('A' as string))
PREHOOK: type: QUERY
PREHOOK: Input: _dummy_database@_dummy_table
PREHOOK: Output: default@array_test
POSTHOOK: query: insert into array_test select array(cast(0 as bigint)), 
array(cast('A' as string))
POSTHOOK: type: QUERY
POSTHOOK: Input: _dummy_database@_dummy_table
POSTHOOK: Output: default@array_test
POSTHOOK: Lineage: array_test.c1 EXPRESSION []
POSTHOOK: Lineage: array_test.c2 EXPRESSION []
PREHOOK: query: insert into array_test select array(cast(1 as bigint)), 
array(cast(1 as string))
PREHOOK: type: QUERY
PREHOOK: Input: _dummy_database@_dummy_table
PREHOOK: Output: default@array_test
POSTHOOK: query: insert into array_test select array(cast(1 as bigint)), 
array(cast(1 as string))
POSTHOOK: type: QUERY
POSTHOOK: Input: _dummy_database@_dummy_table
POSTHOOK: Output: default@array_test
POSTHOOK: Lineage: array_test.c1 EXPRESSION []
POSTHOOK: Lineage: array_test.c2 EXPRESSION []
PREHOOK: query: select * from array_test
PREHOOK: type: QUERY
PREHOOK: Input: default@array_test
#### A masked pattern was here ####
POSTHOOK: query: select * from array_test
POSTHOOK: type: QUERY
POSTHOOK: Input: default@array_test
#### A masked pattern was here ####
[null] [null]
[] []
[0] ["A"]
[1] ["1"]
PREHOOK: query: select c1,c2 from array_test order by c1,c2
PREHOOK: type: QUERY
PREHOOK: Input: default@array_test
#### A masked pattern was here ####
POSTHOOK: query: select c1,c2 from array_test order by c1,c2
POSTHOOK: type: QUERY
POSTHOOK: Input: default@array_test
#### A masked pattern was here ####
[0] ["A"]
[0] NULL
[0] NULL
[1] ["1"]{noformat}
When we disable vectorization, the result is as expected
{noformat}
set hive.vectorized.execution.enabled=false;
select c1,c2 from array_test order by c1,c2;{noformat}
{noformat}
PREHOOK: query: select c1,c2 from array_test order by c1,c2
PREHOOK: type: QUERY
PREHOOK: Input: default@array_test
#### A masked pattern was here ####
POSTHOOK: query: select c1,c2 from array_test order by c1,c2
POSTHOOK: type: QUERY
POSTHOOK: Input: default@array_test
#### A masked pattern was here ####
[] []
[0] ["A"]
[1] ["1"]
NULL NULL{noformat}



--
This message was sent by Atlassian Jira
(v8.20.10#820010)

Reply via email to