Re: [PR] [feature](thrift api) support expr in MemoryScratchSink and make arrow::Schema recalculate with block info [doris]

via GitHub Tue, 17 Oct 2023 18:41:13 -0700


xiaokang commented on code in PR #24603:
URL: https://github.com/apache/doris/pull/24603#discussion_r1362993792



##########
be/src/util/arrow/row_batch.h:
##########
@@ -45,6 +46,9 @@ Status convert_to_arrow_type(const TypeDescriptor& type, 
std::shared_ptr<arrow::
 Status convert_to_arrow_schema(const RowDescriptor& row_desc,
                                std::shared_ptr<arrow::Schema>* result);
 
+Status get_block_arrow_schema(const vectorized::Block& block,

Review Comment:
   It's better to use the same name convert_to_arrow_schema to indicate 
function overload. 



##########
be/src/util/arrow/block_convertor.cpp:
##########
@@ -408,7 +410,10 @@ Status 
FromBlockConverter::convert(std::shared_ptr<arrow::RecordBatch>* out) {
 Status convert_to_arrow_batch(const vectorized::Block& block,
                               const std::shared_ptr<arrow::Schema>& schema, 
arrow::MemoryPool* pool,
                               std::shared_ptr<arrow::RecordBatch>* result) {
-    FromBlockConverter converter(block, schema, pool);
+    // FromBlockConverter converter(block, schema, pool);
+    std::shared_ptr<arrow::Schema> block_arrow_schema;
+    RETURN_IF_ERROR(get_block_arrow_schema(block, &block_arrow_schema));

Review Comment:
   Why not use schema from argument?



##########
be/src/util/arrow/block_convertor.cpp:
##########
@@ -408,7 +410,10 @@ Status 
FromBlockConverter::convert(std::shared_ptr<arrow::RecordBatch>* out) {
 Status convert_to_arrow_batch(const vectorized::Block& block,
                               const std::shared_ptr<arrow::Schema>& schema, 
arrow::MemoryPool* pool,
                               std::shared_ptr<arrow::RecordBatch>* result) {
-    FromBlockConverter converter(block, schema, pool);
+    // FromBlockConverter converter(block, schema, pool);

Review Comment:
   delete comment



##########
be/src/vec/sink/vmemory_scratch_sink.cpp:
##########
@@ -75,13 +76,18 @@ Status MemoryScratchSink::prepare(RuntimeState* state) {
     return Status::OK();
 }
 
-Status MemoryScratchSink::send(RuntimeState* state, Block* block, bool eos) {
-    if (nullptr == block || 0 == block->rows()) {
+Status MemoryScratchSink::send(RuntimeState* state, Block* input_block, bool 
eos) {
+    if (nullptr == input_block || 0 == input_block->rows()) {
         return Status::OK();
     }
     std::shared_ptr<arrow::RecordBatch> result;
+    // Exec vectorized expr here to speed up, block.rows() == 0 means expr exec
+    // failed, just return the error status
+    Block block;
+    
RETURN_IF_ERROR(VExprContext::get_output_block_after_execute_exprs(_output_vexpr_ctxs,
+                                                                       
*input_block, &block));
     RETURN_IF_ERROR(
-            convert_to_arrow_batch(*block, _arrow_schema, 
arrow::default_memory_pool(), &result));
+            convert_to_arrow_batch(block, _arrow_schema, 
arrow::default_memory_pool(), &result));

Review Comment:
   It's better to pass a new schema to convert_to_arrow_batch() function call 
instead of get new schema in convert_to_arrow_batch().



##########
be/src/vec/data_types/data_type.h:
##########
@@ -77,7 +77,7 @@ class IDataType : private boost::noncopyable {
     /// Data type id. It's used for runtime type checks.
     virtual TypeIndex get_type_id() const = 0;
 
-    virtual PrimitiveType get_type_as_primitive_type() const = 0;
+    virtual TypeDescriptor get_type_as_type_descriptor() const = 0;

Review Comment:
   Why do this API change?



##########
be/src/olap/tablet_schema.cpp:
##########
@@ -521,7 +521,7 @@ vectorized::AggregateFunctionPtr 
TabletColumn::get_aggregate_function_union(
 
 vectorized::AggregateFunctionPtr 
TabletColumn::get_aggregate_function(std::string suffix) const {
     auto type = 
vectorized::DataTypeFactory::instance().create_data_type(*this);
-    if (type && type->get_type_as_primitive_type() == 
PrimitiveType::TYPE_AGG_STATE) {
+    if (type && type->get_type_as_type_descriptor().type == 
PrimitiveType::TYPE_AGG_STATE) {

Review Comment:
   what's the difference?



##########
be/src/vec/sink/vmemory_scratch_sink.cpp:
##########
@@ -75,13 +76,18 @@ Status MemoryScratchSink::prepare(RuntimeState* state) {
     return Status::OK();
 }
 
-Status MemoryScratchSink::send(RuntimeState* state, Block* block, bool eos) {
-    if (nullptr == block || 0 == block->rows()) {
+Status MemoryScratchSink::send(RuntimeState* state, Block* input_block, bool 
eos) {
+    if (nullptr == input_block || 0 == input_block->rows()) {
         return Status::OK();
     }
     std::shared_ptr<arrow::RecordBatch> result;
+    // Exec vectorized expr here to speed up, block.rows() == 0 means expr exec
+    // failed, just return the error status
+    Block block;
+    
RETURN_IF_ERROR(VExprContext::get_output_block_after_execute_exprs(_output_vexpr_ctxs,

Review Comment:
   This line is the core of this PR, right?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Re: [PR] [feature](thrift api) support expr in MemoryScratchSink and make arrow::Schema recalculate with block info [doris]

Reply via email to