[ https://issues.apache.org/jira/browse/HIVE-12827?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15102353#comment-15102353 ]
Gopal V commented on HIVE-12827: -------------------------------- The fill after every operation is unnecessary; the theory about the other patch was that some UDF in that query wasn't handling the hasNoNulls flag. That is not true: the original issue was that a scratch column feeding the COALESCE() is reused for the Join output columns, and setting a column value via FloatReader does not set isNull[batchIndex] = false (so the Filter on cr_return_amount then removes those rows). CBO rewrites the left outer join into an inner join, pushing the filter below the join, so there's no more TS-FIL-MJ-FIL as the FIL migrates to the broadcast side. Here's my simplified example, which I used to narrow down the issue to FloatTreeReader. {code} set hive.cbo.enable=false; set hive.vectorized.execution.reducesink.new.enabled=false; set hive.vectorized.execution.mapjoin.native.enabled=true; set hive.vectorized.execution.reduce.enabled=false; set hive.vectorized.execution.reduce.groupby.enabled=false; use testing; create table if not exists cs stored as orc as select IF (cs_item_sk IN ( 1365 , 2243 , 2445 , 3259 , 3267 , 4027 , 5263 , 6003 , 8371 , 9593 , 10383, 10763, 11351, 12359, 12887, 13449, 16501, 16547 ), cs_item_sk, 0) as cs_item_sk , cs_order_number, cs_net_paid, cs_quantity from tpcds_bin_partitioned_orc_200.catalog_sales where true and cs_sold_date_sk = 2452245 and cs_net_profit > 1 and cs_net_paid > 0 and cs_quantity > 0 and cs_item_sk between 1365 and 16547 ; create table if not exists cr as select cr_return_amount, cr_item_sk, cr_order_number from tpcds_bin_partitioned_orc_200.catalog_returns where cr_returned_date_sk between 2452351 and 2452400 and cr_item_sk IN ( 1365 , 2243 , 2445 , 3259 , 3267 , 4027 , 5263 , 6003 , 8371 , 9593 , 10383, 10763, 11351, 12359, 12887, 13449, 16501, 16547 ) order by cr_item_sk ; select * from (select cs.cs_item_sk as item, coalesce(cr.cr_return_amount,0) as return_amount ,coalesce(cs.cs_net_paid,0) as net_paid -- 
(cast(sum(coalesce(cr.cr_return_amount,0)) as double)/ -- cast(sum(coalesce(cs.cs_net_paid,0)) as double)) as currency_ratio from cs -- catalog_sales cs left outer join cr -- catalog_returns cr on cs.cs_order_number = cr.cr_order_number and cs.cs_item_sk = cr.cr_item_sk where cr.cr_return_amount > 10000 and cs.cs_quantity > 0 -- group by cs.cs_item_sk ) x; {code} > Vectorization: VectorCopyRow/VectorAssignRow/VectorDeserializeRow assign > needs explicit isNull[offset] modification > ------------------------------------------------------------------------------------------------------------------- > > Key: HIVE-12827 > URL: https://issues.apache.org/jira/browse/HIVE-12827 > Project: Hive > Issue Type: Bug > Reporter: Gopal V > Assignee: Gopal V > Attachments: HIVE-12827.2.patch > > > Some scenarios do set Double.NaN instead of isNull=true, but not all types are > consistent. > Examples where isNull is left un-set for valid values are: > {code} > private class FloatReader extends AbstractDoubleReader { > FloatReader(int columnIndex) { > super(columnIndex); > } > @Override > void apply(VectorizedRowBatch batch, int batchIndex) throws IOException { > DoubleColumnVector colVector = (DoubleColumnVector) > batch.cols[columnIndex]; > if (deserializeRead.readCheckNull()) { > VectorizedBatchUtil.setNullColIsNullValue(colVector, batchIndex); > } else { > float value = deserializeRead.readFloat(); > colVector.vector[batchIndex] = (double) value; > } > } > } > {code} > {code} > private class DoubleCopyRow extends CopyRow { > DoubleCopyRow(int inColumnIndex, int outColumnIndex) { > super(inColumnIndex, outColumnIndex); > } > @Override > void copy(VectorizedRowBatch inBatch, int inBatchIndex, > VectorizedRowBatch outBatch, int outBatchIndex) { > DoubleColumnVector inColVector = (DoubleColumnVector) > inBatch.cols[inColumnIndex]; > DoubleColumnVector outColVector = (DoubleColumnVector) > outBatch.cols[outColumnIndex]; > if (inColVector.isRepeating) { > if (inColVector.noNulls || 
!inColVector.isNull[0]) { > outColVector.vector[outBatchIndex] = inColVector.vector[0]; > } else { > VectorizedBatchUtil.setNullColIsNullValue(outColVector, > outBatchIndex); > } > } else { > if (inColVector.noNulls || !inColVector.isNull[inBatchIndex]) { > outColVector.vector[outBatchIndex] = > inColVector.vector[inBatchIndex]; > } else { > VectorizedBatchUtil.setNullColIsNullValue(outColVector, > outBatchIndex); > } > } > } > } > {code} > {code} > private static abstract class VectorDoubleColumnAssign > extends VectorColumnAssignVectorBase<DoubleColumnVector> { > protected void assignDouble(double value, int destIndex) { > outCol.vector[destIndex] = value; > } > } > {code} > The pattern to imitate would be the earlier code from VectorizedBatchUtil: > {code} > case DOUBLE: { > DoubleColumnVector dcv = (DoubleColumnVector) batch.cols[offset + > colIndex]; > if (writableCol != null) { > dcv.vector[rowIndex] = ((DoubleWritable) writableCol).get(); > dcv.isNull[rowIndex] = false; > } else { > dcv.vector[rowIndex] = Double.NaN; > setNullColIsNullValue(dcv, rowIndex); > } > } > break; > {code} -- This message was sent by Atlassian JIRA (v6.3.4#6332)