This is an automated email from the ASF dual-hosted git repository. morningman pushed a commit to branch branch-1.2-lts in repository https://gitbox.apache.org/repos/asf/doris.git
commit fb5420c26276f5a34511d76497a3a2a1ce7ffe57 Author: Ashin Gau <ashin...@users.noreply.github.com> AuthorDate: Thu Feb 2 11:51:09 2023 +0800 [improvement](multi-catalog) increase default batch_size to 4064 (#16326) The performance of ClickBench Q30 is affected by batch_size: | batch_size | 1024 | 4096 | 20480 | | -- | -- | -- | -- | | Q30 query time | 2.27 | 1.08 | 0.62 | The aggregation operator creates a new result block for each batch block, and because Q30 has 90 columns, this is time-consuming. A larger batch_size decreases the number of aggregation blocks and therefore improves performance. The Doris internal reader reads at least 4064 rows even if batch_size < 4064, so this PR makes reading external tables behave the same as reading internal tables. --- be/src/vec/exec/format/csv/csv_reader.cpp | 2 +- be/src/vec/exec/format/generic_reader.h | 2 ++ be/src/vec/exec/format/json/new_json_reader.cpp | 2 +- be/src/vec/exec/format/orc/vorc_reader.cpp | 2 +- be/src/vec/exec/format/parquet/vparquet_reader.cpp | 2 +- fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java | 4 ++-- 6 files changed, 8 insertions(+), 6 deletions(-) diff --git a/be/src/vec/exec/format/csv/csv_reader.cpp b/be/src/vec/exec/format/csv/csv_reader.cpp index d811866d13..c7099b24c7 100644 --- a/be/src/vec/exec/format/csv/csv_reader.cpp +++ b/be/src/vec/exec/format/csv/csv_reader.cpp @@ -188,7 +188,7 @@ Status CsvReader::get_next_block(Block* block, size_t* read_rows, bool* eof) { return Status::OK(); } - const int batch_size = _state->batch_size(); + const int batch_size = std::max(_state->batch_size(), (int)_MIN_BATCH_SIZE); size_t rows = 0; auto columns = block->mutate_columns(); while (rows < batch_size && !_line_reader_eof) { diff --git a/be/src/vec/exec/format/generic_reader.h b/be/src/vec/exec/format/generic_reader.h index 30e93aacd8..9f4cfd00ee 100644 --- a/be/src/vec/exec/format/generic_reader.h +++ b/be/src/vec/exec/format/generic_reader.h @@ -60,6 +60,8 @@ 
public: } protected: + const size_t _MIN_BATCH_SIZE = 4064; // 4096 - 32(padding) + /// Whether the underlying FileReader has filled the partition&missing columns bool _fill_all_columns = false; }; diff --git a/be/src/vec/exec/format/json/new_json_reader.cpp b/be/src/vec/exec/format/json/new_json_reader.cpp index 68a3f089e5..0ed5a0aeb0 100644 --- a/be/src/vec/exec/format/json/new_json_reader.cpp +++ b/be/src/vec/exec/format/json/new_json_reader.cpp @@ -105,7 +105,7 @@ Status NewJsonReader::get_next_block(Block* block, size_t* read_rows, bool* eof) return Status::OK(); } - const int batch_size = _state->batch_size(); + const int batch_size = std::max(_state->batch_size(), (int)_MIN_BATCH_SIZE); auto columns = block->mutate_columns(); while (columns[0]->size() < batch_size && !_reader_eof) { diff --git a/be/src/vec/exec/format/orc/vorc_reader.cpp b/be/src/vec/exec/format/orc/vorc_reader.cpp index c295712491..f313cb60f0 100644 --- a/be/src/vec/exec/format/orc/vorc_reader.cpp +++ b/be/src/vec/exec/format/orc/vorc_reader.cpp @@ -72,7 +72,7 @@ OrcReader::OrcReader(RuntimeProfile* profile, const TFileScanRangeParams& params : _profile(profile), _scan_params(params), _scan_range(range), - _batch_size(batch_size), + _batch_size(std::max(batch_size, _MIN_BATCH_SIZE)), _range_start_offset(range.start_offset), _range_size(range.size), _ctz(ctz), diff --git a/be/src/vec/exec/format/parquet/vparquet_reader.cpp b/be/src/vec/exec/format/parquet/vparquet_reader.cpp index 7881eebe2d..cfc904d607 100644 --- a/be/src/vec/exec/format/parquet/vparquet_reader.cpp +++ b/be/src/vec/exec/format/parquet/vparquet_reader.cpp @@ -36,7 +36,7 @@ ParquetReader::ParquetReader(RuntimeProfile* profile, const TFileScanRangeParams : _profile(profile), _scan_params(params), _scan_range(range), - _batch_size(batch_size), + _batch_size(std::max(batch_size, _MIN_BATCH_SIZE)), _range_start_offset(range.start_offset), _range_size(range.size), _ctz(ctz) { diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java index bfaefd8ac5..db99c90e26 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java +++ b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java @@ -384,9 +384,9 @@ public class SessionVariable implements Serializable, Writable { @VariableMgr.VarAttr(name = CODEGEN_LEVEL) public int codegenLevel = 0; - // 1024 minus 16 + 16 bytes padding that in padding pod array + // 4096 minus 16 + 16 bytes padding that in padding pod array @VariableMgr.VarAttr(name = BATCH_SIZE) - public int batchSize = 992; + public int batchSize = 4064; @VariableMgr.VarAttr(name = DISABLE_STREAMING_PREAGGREGATIONS) public boolean disableStreamPreaggregations = false; --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org