This is an automated email from the ASF dual-hosted git repository.

morningman pushed a commit to branch branch-1.2-lts
in repository https://gitbox.apache.org/repos/asf/doris.git

commit fb5420c26276f5a34511d76497a3a2a1ce7ffe57
Author: Ashin Gau <ashin...@users.noreply.github.com>
AuthorDate: Thu Feb 2 11:51:09 2023 +0800

    [improvement](multi-catalog) increase default batch_size to 4064 (#16326)
    
    The performance of ClickBench Q30 is affected by batch_size:
    | batch_size | 1024 | 4096 | 20480 |
    | -- | -- | -- | -- |
    | Q30 query time (s) | 2.27 | 1.08 | 0.62 |
    
    Because the aggregation operator creates a new result block for each
    batch block, and Q30 has 90 columns, building those blocks is
    time-consuming. A larger batch_size reduces the number of aggregation
    result blocks, which improves performance.
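
    As a rough illustration (not a figure measured in this PR): scanning R
    rows produces about R / batch_size aggregation result blocks, each
    materializing 90 columns for Q30, so raising batch_size from 1024 to
    4064 cuts the number of block (and per-block column) allocations by
    roughly 4x (4064 / 1024 ≈ 4).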
    
    The Doris internal reader reads at least 4064 rows even if
    batch_size < 4064, so this PR keeps the process of reading external
    tables the same as internal tables.
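
    A minimal, self-contained C++ sketch of the clamp applied in each file
    reader below; kMinBatchSize and effective_batch_size are illustrative
    names, not identifiers from the patch:

        #include <algorithm>

        // 4096 minus 32 bytes of PODArray padding, matching _MIN_BATCH_SIZE.
        constexpr int kMinBatchSize = 4064;

        // External-table readers never fetch fewer rows per batch than the
        // internal reader's effective minimum, whatever the session value is.
        int effective_batch_size(int session_batch_size) {
            return std::max(session_batch_size, kMinBatchSize);
        }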
---
 be/src/vec/exec/format/csv/csv_reader.cpp                         | 2 +-
 be/src/vec/exec/format/generic_reader.h                           | 2 ++
 be/src/vec/exec/format/json/new_json_reader.cpp                   | 2 +-
 be/src/vec/exec/format/orc/vorc_reader.cpp                        | 2 +-
 be/src/vec/exec/format/parquet/vparquet_reader.cpp                | 2 +-
 fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java | 4 ++--
 6 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/be/src/vec/exec/format/csv/csv_reader.cpp b/be/src/vec/exec/format/csv/csv_reader.cpp
index d811866d13..c7099b24c7 100644
--- a/be/src/vec/exec/format/csv/csv_reader.cpp
+++ b/be/src/vec/exec/format/csv/csv_reader.cpp
@@ -188,7 +188,7 @@ Status CsvReader::get_next_block(Block* block, size_t* read_rows, bool* eof) {
         return Status::OK();
     }
 
-    const int batch_size = _state->batch_size();
+    const int batch_size = std::max(_state->batch_size(), (int)_MIN_BATCH_SIZE);
     size_t rows = 0;
     auto columns = block->mutate_columns();
     while (rows < batch_size && !_line_reader_eof) {
diff --git a/be/src/vec/exec/format/generic_reader.h b/be/src/vec/exec/format/generic_reader.h
index 30e93aacd8..9f4cfd00ee 100644
--- a/be/src/vec/exec/format/generic_reader.h
+++ b/be/src/vec/exec/format/generic_reader.h
@@ -60,6 +60,8 @@ public:
     }
 
 protected:
+    const size_t _MIN_BATCH_SIZE = 4064; // 4096 - 32 (padding)
+
     /// Whether the underlying FileReader has filled the partition&missing columns
     bool _fill_all_columns = false;
 };
diff --git a/be/src/vec/exec/format/json/new_json_reader.cpp b/be/src/vec/exec/format/json/new_json_reader.cpp
index 68a3f089e5..0ed5a0aeb0 100644
--- a/be/src/vec/exec/format/json/new_json_reader.cpp
+++ b/be/src/vec/exec/format/json/new_json_reader.cpp
@@ -105,7 +105,7 @@ Status NewJsonReader::get_next_block(Block* block, size_t* read_rows, bool* eof)
         return Status::OK();
     }
 
-    const int batch_size = _state->batch_size();
+    const int batch_size = std::max(_state->batch_size(), (int)_MIN_BATCH_SIZE);
     auto columns = block->mutate_columns();
 
     while (columns[0]->size() < batch_size && !_reader_eof) {
diff --git a/be/src/vec/exec/format/orc/vorc_reader.cpp b/be/src/vec/exec/format/orc/vorc_reader.cpp
index c295712491..f313cb60f0 100644
--- a/be/src/vec/exec/format/orc/vorc_reader.cpp
+++ b/be/src/vec/exec/format/orc/vorc_reader.cpp
@@ -72,7 +72,7 @@ OrcReader::OrcReader(RuntimeProfile* profile, const TFileScanRangeParams& params
         : _profile(profile),
           _scan_params(params),
           _scan_range(range),
-          _batch_size(batch_size),
+          _batch_size(std::max(batch_size, _MIN_BATCH_SIZE)),
           _range_start_offset(range.start_offset),
           _range_size(range.size),
           _ctz(ctz),
diff --git a/be/src/vec/exec/format/parquet/vparquet_reader.cpp b/be/src/vec/exec/format/parquet/vparquet_reader.cpp
index 7881eebe2d..cfc904d607 100644
--- a/be/src/vec/exec/format/parquet/vparquet_reader.cpp
+++ b/be/src/vec/exec/format/parquet/vparquet_reader.cpp
@@ -36,7 +36,7 @@ ParquetReader::ParquetReader(RuntimeProfile* profile, const TFileScanRangeParams
         : _profile(profile),
           _scan_params(params),
           _scan_range(range),
-          _batch_size(batch_size),
+          _batch_size(std::max(batch_size, _MIN_BATCH_SIZE)),
           _range_start_offset(range.start_offset),
           _range_size(range.size),
           _ctz(ctz) {
diff --git a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java
index bfaefd8ac5..db99c90e26 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java
@@ -384,9 +384,9 @@ public class SessionVariable implements Serializable, Writable {
     @VariableMgr.VarAttr(name = CODEGEN_LEVEL)
     public int codegenLevel = 0;
 
-    // 1024 minus 16 + 16 bytes padding that in padding pod array
+    // 4096 minus 16 + 16 bytes padding that in padding pod array
     @VariableMgr.VarAttr(name = BATCH_SIZE)
-    public int batchSize = 992;
+    public int batchSize = 4064;
 
     @VariableMgr.VarAttr(name = DISABLE_STREAMING_PREAGGREGATIONS)
     public boolean disableStreamPreaggregations = false;


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org