[jira] [Updated] (HIVE-25874) Slow filter evaluation of nest struct fields in vectorized executions

Zoltan Haindrich (Jira) Tue, 18 Jan 2022 02:21:06 -0800


     [ 
https://issues.apache.org/jira/browse/HIVE-25874?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]


Zoltan Haindrich updated HIVE-25874:
------------------------------------
    Description: 
time is spent at resizing vectors around 
[here|https://github.com/apache/hive/blob/200c0bf1feb259f4d95bf065a2ab38fe684383da/storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/ColumnVector.java#L252]
 or in some other "ensureSize" method

{code:java}

create table t as
select
named_struct('id',13,'str','string','nest',named_struct('id',12,'str','string','arr',array('value','value','value','value','value','value','value','value','value','value','value','value','value','value','value','value','value','value','value','value','value','value','value','value','value','value','value','value','value','value','value','value')))
s;

-- go up to 1M rows
insert into table t select * from t union all select * from t union all select 
* from t union all select * from t union all select * from t union all select * 
from t union all select * from t union all select * from t union all select * 
from t;
insert into table t select * from t union all select * from t union all select 
* from t union all select * from t union all select * from t union all select * 
from t union all select * from t union all select * from t union all select * 
from t;
insert into table t select * from t union all select * from t union all select 
* from t union all select * from t union all select * from t union all select * 
from t union all select * from t union all select * from t union all select * 
from t;
insert into table t select * from t union all select * from t union all select 
* from t union all select * from t union all select * from t union all select * 
from t union all select * from t union all select * from t union all select * 
from t;
insert into table t select * from t union all select * from t union all select 
* from t union all select * from t union all select * from t union all select * 
from t union all select * from t union all select * from t union all select * 
from t;
-- insert into table t select * from t union all select * from t union all 
select * from t union all select * from t union all select * from t union all 
select * from t union all select * from t union all select * from t union all 
select * from t;


set hive.fetch.task.conversion=none;

select count(1) from t;
--explain
select s
.id from t
where 
s
.nest
.id  > 0;

 {code}


interestingly; the issue is not present:
* for a query not looking into the nested struct
* and in case the struct with the array is at the top level

{code}
select count(1) from t;
--explain
select s
.id from t
where 
s
-- .nest
.id  > 0;
{code}

  was:
{code:java}

create table t as
select
named_struct('id',13,'str','string','nest',named_struct('id',12,'str','string','arr',array('value','value','value','value','value','value','value','value','value','value','value','value','value','value','value','value','value','value','value','value','value','value','value','value','value','value','value','value','value','value','value','value')))
s;

-- go up to 1M rows
insert into table t select * from t union all select * from t union all select 
* from t union all select * from t union all select * from t union all select * 
from t union all select * from t union all select * from t union all select * 
from t;
insert into table t select * from t union all select * from t union all select 
* from t union all select * from t union all select * from t union all select * 
from t union all select * from t union all select * from t union all select * 
from t;
insert into table t select * from t union all select * from t union all select 
* from t union all select * from t union all select * from t union all select * 
from t union all select * from t union all select * from t union all select * 
from t;
insert into table t select * from t union all select * from t union all select 
* from t union all select * from t union all select * from t union all select * 
from t union all select * from t union all select * from t union all select * 
from t;
insert into table t select * from t union all select * from t union all select 
* from t union all select * from t union all select * from t union all select * 
from t union all select * from t union all select * from t union all select * 
from t;
-- insert into table t select * from t union all select * from t union all 
select * from t union all select * from t union all select * from t union all 
select * from t union all select * from t union all select * from t union all 
select * from t;


set hive.fetch.task.conversion=none;

select count(1) from t;
--explain
select s
.id from t
where 
s
.nest
.id  > 0;

 {code}


interestingly; the issue is not present:
* for a query not looking into the nested struct
* and in case the struct with the array is at the top level

{code}
select count(1) from t;
--explain
select s
.id from t
where 
s
-- .nest
.id  > 0;
{code}


> Slow filter evaluation of nest struct fields in vectorized executions
> ---------------------------------------------------------------------
>
>                 Key: HIVE-25874
>                 URL: https://issues.apache.org/jira/browse/HIVE-25874
>             Project: Hive
>          Issue Type: Improvement
>            Reporter: Zoltan Haindrich
>            Priority: Major
>
> time is spent at resizing vectors around 
> [here|https://github.com/apache/hive/blob/200c0bf1feb259f4d95bf065a2ab38fe684383da/storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/ColumnVector.java#L252]
>  or in some other "ensureSize" method
> {code:java}
> create table t as
> select
> named_struct('id',13,'str','string','nest',named_struct('id',12,'str','string','arr',array('value','value','value','value','value','value','value','value','value','value','value','value','value','value','value','value','value','value','value','value','value','value','value','value','value','value','value','value','value','value','value','value')))
> s;
> -- go up to 1M rows
> insert into table t select * from t union all select * from t union all 
> select * from t union all select * from t union all select * from t union all 
> select * from t union all select * from t union all select * from t union all 
> select * from t;
> insert into table t select * from t union all select * from t union all 
> select * from t union all select * from t union all select * from t union all 
> select * from t union all select * from t union all select * from t union all 
> select * from t;
> insert into table t select * from t union all select * from t union all 
> select * from t union all select * from t union all select * from t union all 
> select * from t union all select * from t union all select * from t union all 
> select * from t;
> insert into table t select * from t union all select * from t union all 
> select * from t union all select * from t union all select * from t union all 
> select * from t union all select * from t union all select * from t union all 
> select * from t;
> insert into table t select * from t union all select * from t union all 
> select * from t union all select * from t union all select * from t union all 
> select * from t union all select * from t union all select * from t union all 
> select * from t;
> -- insert into table t select * from t union all select * from t union all 
> select * from t union all select * from t union all select * from t union all 
> select * from t union all select * from t union all select * from t union all 
> select * from t;
> set hive.fetch.task.conversion=none;
> select count(1) from t;
> --explain
> select s
> .id from t
> where 
> s
> .nest
> .id  > 0;
>  {code}
> interestingly; the issue is not present:
> * for a query not looking into the nested struct
> * and in case the struct with the array is at the top level
> {code}
> select count(1) from t;
> --explain
> select s
> .id from t
> where 
> s
> -- .nest
> .id  > 0;
> {code}



--
This message was sent by Atlassian Jira
(v8.20.1#820001)

[jira] [Updated] (HIVE-25874) Slow filter evaluation of nest struct fields in vectorized executions

Reply via email to