xiaokang commented on code in PR #24603: URL: https://github.com/apache/doris/pull/24603#discussion_r1362993792
########## be/src/util/arrow/row_batch.h: ########## @@ -45,6 +46,9 @@ Status convert_to_arrow_type(const TypeDescriptor& type, std::shared_ptr<arrow:: Status convert_to_arrow_schema(const RowDescriptor& row_desc, std::shared_ptr<arrow::Schema>* result); +Status get_block_arrow_schema(const vectorized::Block& block, Review Comment: It's better to use the same name convert_to_arrow_schema to indicate function overload. ########## be/src/util/arrow/block_convertor.cpp: ########## @@ -408,7 +410,10 @@ Status FromBlockConverter::convert(std::shared_ptr<arrow::RecordBatch>* out) { Status convert_to_arrow_batch(const vectorized::Block& block, const std::shared_ptr<arrow::Schema>& schema, arrow::MemoryPool* pool, std::shared_ptr<arrow::RecordBatch>* result) { - FromBlockConverter converter(block, schema, pool); + // FromBlockConverter converter(block, schema, pool); + std::shared_ptr<arrow::Schema> block_arrow_schema; + RETURN_IF_ERROR(get_block_arrow_schema(block, &block_arrow_schema)); Review Comment: Why not use schema from argument? ########## be/src/util/arrow/block_convertor.cpp: ########## @@ -408,7 +410,10 @@ Status FromBlockConverter::convert(std::shared_ptr<arrow::RecordBatch>* out) { Status convert_to_arrow_batch(const vectorized::Block& block, const std::shared_ptr<arrow::Schema>& schema, arrow::MemoryPool* pool, std::shared_ptr<arrow::RecordBatch>* result) { - FromBlockConverter converter(block, schema, pool); + // FromBlockConverter converter(block, schema, pool); Review Comment: delete comment ########## be/src/vec/sink/vmemory_scratch_sink.cpp: ########## @@ -75,13 +76,18 @@ Status MemoryScratchSink::prepare(RuntimeState* state) { return Status::OK(); } -Status MemoryScratchSink::send(RuntimeState* state, Block* block, bool eos) { - if (nullptr == block || 0 == block->rows()) { +Status MemoryScratchSink::send(RuntimeState* state, Block* input_block, bool eos) { + if (nullptr == input_block || 0 == input_block->rows()) { return Status::OK(); } std::shared_ptr<arrow::RecordBatch> result; + // Exec vectorized expr here to speed up, block.rows() == 0 means expr exec + // failed, just return the error status + Block block; + RETURN_IF_ERROR(VExprContext::get_output_block_after_execute_exprs(_output_vexpr_ctxs, + *input_block, &block)); RETURN_IF_ERROR( - convert_to_arrow_batch(*block, _arrow_schema, arrow::default_memory_pool(), &result)); + convert_to_arrow_batch(block, _arrow_schema, arrow::default_memory_pool(), &result)); Review Comment: It's better to pass a new schema to convert_to_arrow_batch() function call instead of get new schema in convert_to_arrow_batch(). ########## be/src/vec/data_types/data_type.h: ########## @@ -77,7 +77,7 @@ class IDataType : private boost::noncopyable { /// Data type id. It's used for runtime type checks. virtual TypeIndex get_type_id() const = 0; - virtual PrimitiveType get_type_as_primitive_type() const = 0; + virtual TypeDescriptor get_type_as_type_descriptor() const = 0; Review Comment: Why do this API change? ########## be/src/olap/tablet_schema.cpp: ########## @@ -521,7 +521,7 @@ vectorized::AggregateFunctionPtr TabletColumn::get_aggregate_function_union( vectorized::AggregateFunctionPtr TabletColumn::get_aggregate_function(std::string suffix) const { auto type = vectorized::DataTypeFactory::instance().create_data_type(*this); - if (type && type->get_type_as_primitive_type() == PrimitiveType::TYPE_AGG_STATE) { + if (type && type->get_type_as_type_descriptor().type == PrimitiveType::TYPE_AGG_STATE) { Review Comment: what's the difference? ########## be/src/vec/sink/vmemory_scratch_sink.cpp: ########## @@ -75,13 +76,18 @@ Status MemoryScratchSink::prepare(RuntimeState* state) { return Status::OK(); } -Status MemoryScratchSink::send(RuntimeState* state, Block* block, bool eos) { - if (nullptr == block || 0 == block->rows()) { +Status MemoryScratchSink::send(RuntimeState* state, Block* input_block, bool eos) { + if (nullptr == input_block || 0 == input_block->rows()) { return Status::OK(); } std::shared_ptr<arrow::RecordBatch> result; + // Exec vectorized expr here to speed up, block.rows() == 0 means expr exec + // failed, just return the error status + Block block; + RETURN_IF_ERROR(VExprContext::get_output_block_after_execute_exprs(_output_vexpr_ctxs, Review Comment: This line is the core of this PR, right? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org