This is an automated email from the ASF dual-hosted git repository. morningman pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/incubator-doris.git
The following commit(s) were added to refs/heads/master by this push: new 00fe9de [Benchmark] Add star schema benchmark tools (#6925) 00fe9de is described below commit 00fe9deaebe70c84a1207273388d58f99cb17cd0 Author: Mingyu Chen <morningman....@gmail.com> AuthorDate: Wed Oct 27 09:55:36 2021 +0800 [Benchmark] Add star schema benchmark tools (#6925) This CL mainly changes: 1. Add star schema benchmark tools in `tools/ssb-tools`, for user to easy load and test with SSB data set. 2. Disable the segment cache for some read scenario such as compaction and alter operation.(Fix #6924 ) 3. Fix a bug that `max_segment_num_per_rowset` won't work(Fix #6926) 4. Enable `enable_batch_delete_by_default` by default. --- be/src/agent/cgroups_mgr.cpp | 4 +- be/src/common/config.h | 2 +- be/src/exec/tablet_sink.cpp | 32 ++-- be/src/exec/tablet_sink.h | 12 +- be/src/olap/delta_writer.cpp | 3 + be/src/olap/rowset/beta_rowset_reader.cpp | 5 +- be/src/olap/segment_loader.cpp | 24 +-- be/src/olap/segment_loader.h | 27 +++- docs/.vuepress/sidebar/en.js | 3 +- docs/.vuepress/sidebar/zh-CN.js | 3 +- docs/en/administrator-guide/config/be_config.md | 2 +- docs/en/best-practices/star-schema-benchmark.md | 163 ++++++++++++++++++++ docs/zh-CN/administrator-guide/config/be_config.md | 2 +- docs/zh-CN/best-practices/star-schema-benchmark.md | 164 +++++++++++++++++++++ .../main/java/org/apache/doris/common/Config.java | 2 +- tools/ssb-tools/build-ssb-dbgen.sh | 53 +++++++ tools/ssb-tools/create-tables.sql | 117 +++++++++++++++ tools/ssb-tools/doris-cluster.conf | 27 ++++ tools/ssb-tools/gen-ssb-data.sh | 112 ++++++++++++++ tools/ssb-tools/load-dimension-data.sh | 68 +++++++++ tools/ssb-tools/load-fact-data.sh | 126 ++++++++++++++++ tools/ssb-tools/queries/q1.1.sql | 24 +++ tools/ssb-tools/queries/q1.2.sql | 24 +++ tools/ssb-tools/queries/q1.3.sql | 25 ++++ tools/ssb-tools/queries/q2.1.sql | 26 ++++ tools/ssb-tools/queries/q2.2.sql | 27 ++++ tools/ssb-tools/queries/q2.3.sql | 26 ++++ tools/ssb-tools/queries/q3.1.sql | 28 ++++ tools/ssb-tools/queries/q3.2.sql | 28 ++++ tools/ssb-tools/queries/q3.3.sql | 30 ++++ tools/ssb-tools/queries/q3.4.sql | 30 ++++ tools/ssb-tools/queries/q4.1.sql | 30 ++++ tools/ssb-tools/queries/q4.2.sql | 31 ++++ tools/ssb-tools/queries/q4.3.sql | 29 ++++ 34 files changed, 1259 insertions(+), 50 deletions(-) diff --git a/be/src/agent/cgroups_mgr.cpp b/be/src/agent/cgroups_mgr.cpp index edb274f..ffcbe21 100644 --- a/be/src/agent/cgroups_mgr.cpp +++ b/be/src/agent/cgroups_mgr.cpp @@ -325,8 +325,8 @@ AgentStatus CgroupsMgr::init_cgroups() { _is_cgroups_init_success = true; return AgentStatus::DORIS_SUCCESS; } else { - LOG(WARNING) << "Could not find a valid cgroups path for resource isolation," - << "current value is " << _root_cgroups_path << ". ignore it."; + VLOG_NOTICE << "Could not find a valid cgroups path for resource isolation," + << "current value is " << _root_cgroups_path << ". ignore it."; _is_cgroups_init_success = false; return AgentStatus::DORIS_ERROR; } diff --git a/be/src/common/config.h b/be/src/common/config.h index 37e71d9..42c43fb 100644 --- a/be/src/common/config.h +++ b/be/src/common/config.h @@ -629,7 +629,7 @@ CONF_Int32(send_batch_thread_pool_queue_size, "102400"); // so if there are too many segment in a rowset, the compaction process // will run out of memory. // When doing compaction, each segment may take at least 1MB buffer. -CONF_mInt32(max_segment_num_per_rowset, "100"); +CONF_mInt32(max_segment_num_per_rowset, "200"); // The connection timeout when connecting to external table such as odbc table. CONF_mInt32(external_table_connect_timeout_sec, "5"); diff --git a/be/src/exec/tablet_sink.cpp b/be/src/exec/tablet_sink.cpp index d282058..c5c02d2 100644 --- a/be/src/exec/tablet_sink.cpp +++ b/be/src/exec/tablet_sink.cpp @@ -163,10 +163,7 @@ Status NodeChannel::open_wait() { // add batch closure _add_batch_closure = ReusableClosure<PTabletWriterAddBatchResult>::create(); _add_batch_closure->addFailedHandler([this]() { - std::stringstream ss; - ss << name() << " add batch req rpc failed, " << print_load_info() - << ", node=" << node_info()->id << ":" << node_info()->brpc_port; - _cancel_with_msg(ss.str()); + _cancel_with_msg(fmt::format("{}, err: {}", channel_info(), _add_batch_closure->cntl.ErrorText())); }); _add_batch_closure->addSuccessHandler([this](const PTabletWriterAddBatchResult& result, @@ -183,14 +180,7 @@ Status NodeChannel::open_wait() { _add_batches_finished = true; } } else { - std::stringstream ss; - // FIXME(cmy): There is a problem that when calling node_info, the node_info seems not initialized. - // But I don't know why. so here I print node_info()->id instead of node_info()->host - // to avoid BE crash. It needs further observation. - ss << name() << " add batch req success but status isn't ok, " << print_load_info() - << ", backend id=" << node_info()->id << ":" << node_info()->brpc_port - << ", errmsg=" << status.get_error_msg(); - _cancel_with_msg(ss.str()); + _cancel_with_msg(fmt::format("{}, add batch req success but status isn't ok, err: {}", channel_info(), status.get_error_msg())); } if (result.has_execution_time_us()) { @@ -317,10 +307,10 @@ Status NodeChannel::close_wait(RuntimeState* state) { return Status::InternalError(ss.str()); } -void NodeChannel::cancel() { +void NodeChannel::cancel(const std::string& cancel_msg) { // we don't need to wait last rpc finished, cause closure's release/reset will join. // But do we need brpc::StartCancel(call_id)? - _cancelled = true; + _cancel_with_msg(cancel_msg); PTabletWriterCancelRequest request; request.set_allocated_id(&_parent->_load_id); @@ -384,7 +374,7 @@ void NodeChannel::try_send_batch() { int remain_ms = _rpc_timeout_ms - _timeout_watch.elapsed_time() / NANOS_PER_MILLIS; if (UNLIKELY(remain_ms < _min_rpc_timeout_ms)) { if (remain_ms <= 0 && !request.eos()) { - cancel(); + cancel(fmt::format("{}, err: timeout", channel_info())); } else { remain_ms = _min_rpc_timeout_ms; } @@ -672,12 +662,10 @@ Status OlapTableSink::open(RuntimeState* state) { auto st = ch->open_wait(); if (!st.ok()) { std::stringstream err; - err << ch->name() << ": tablet open failed, " << ch->print_load_info() - << ", node=" << ch->node_info()->host << ":" << ch->node_info()->brpc_port - << ", errmsg=" << st.get_error_msg(); + err << ch->channel_info() << ", tablet open failed, err: " << st.get_error_msg(); LOG(WARNING) << err.str(); - index_channel->mark_as_failed(ch); ss << err.str() << "; "; + index_channel->mark_as_failed(ch); } }); @@ -786,9 +774,7 @@ Status OlapTableSink::close(RuntimeState* state, Status close_status) { if (!s.ok()) { // 'status' will store the last non-ok status of all channels status = s; - LOG(WARNING) << ch->name() << ": close channel failed, " - << ch->print_load_info() - << ". error_msg=" << s.get_error_msg(); + LOG(WARNING) << ch->channel_info() << ", close channel failed, err: " << s.get_error_msg(); } ch->time_report(&node_add_batch_counter_map, &serialize_batch_ns, &mem_exceeded_block_ns, &queue_push_lock_ns, @@ -837,7 +823,7 @@ Status OlapTableSink::close(RuntimeState* state, Status close_status) { LOG(INFO) << ss.str(); } else { for (auto channel : _channels) { - channel->for_each_node_channel([](NodeChannel* ch) { ch->cancel(); }); + channel->for_each_node_channel([&status](NodeChannel* ch) { ch->cancel(status.get_error_msg()); }); } } diff --git a/be/src/exec/tablet_sink.h b/be/src/exec/tablet_sink.h index 277ecef..9b58548 100644 --- a/be/src/exec/tablet_sink.h +++ b/be/src/exec/tablet_sink.h @@ -17,6 +17,7 @@ #pragma once +#include <fmt/format.h> #include <memory> #include <queue> #include <set> @@ -164,7 +165,7 @@ public: Status mark_close(); Status close_wait(RuntimeState* state); - void cancel(); + void cancel(const std::string& cancel_msg); // return: // 0: stopped, send finished(eos request has been sent), or any internal error; @@ -191,8 +192,6 @@ public: } int64_t node_id() const { return _node_id; } - const NodeInfo* node_info() const { return _node_info; } - std::string print_load_info() const { return _load_info; } std::string name() const { return _name; } Status none_of(std::initializer_list<bool> vars); @@ -200,6 +199,13 @@ public: // TODO(HW): remove after mem tracker shared void clear_all_batches(); + std::string channel_info() const { + // FIXME(cmy): There is a problem that when calling node_info, the node_info seems not initialized. + // But I don't know why. so here I print node_info->id instead of node_info->host + // to avoid BE crash. It needs further observation. + return fmt::format("{}, {}, node={}:{}", _name, _load_info, _node_info->id, _node_info->brpc_port); + } + private: void _cancel_with_msg(const std::string& msg); diff --git a/be/src/olap/delta_writer.cpp b/be/src/olap/delta_writer.cpp index a5cb6de..3868e88 100644 --- a/be/src/olap/delta_writer.cpp +++ b/be/src/olap/delta_writer.cpp @@ -180,6 +180,9 @@ OLAPStatus DeltaWriter::write(Tuple* tuple) { } OLAPStatus DeltaWriter::_flush_memtable_async() { + if (++_segment_counter > config::max_segment_num_per_rowset) { + return OLAP_ERR_TOO_MANY_SEGMENTS; + } return _flush_token->submit(_mem_table); } diff --git a/be/src/olap/rowset/beta_rowset_reader.cpp b/be/src/olap/rowset/beta_rowset_reader.cpp index 1c5e511..f395227 100644 --- a/be/src/olap/rowset/beta_rowset_reader.cpp +++ b/be/src/olap/rowset/beta_rowset_reader.cpp @@ -93,11 +93,12 @@ OLAPStatus BetaRowsetReader::init(RowsetReaderContext* read_context) { read_options.use_page_cache = read_context->use_page_cache; // load segments - RETURN_NOT_OK(SegmentLoader::instance()->load_segments(_rowset, &_segment_cache_handle)); + RETURN_NOT_OK(SegmentLoader::instance()->load_segments( + _rowset, &_segment_cache_handle, read_context->reader_type == ReaderType::READER_QUERY)); // create iterator for each segment std::vector<std::unique_ptr<RowwiseIterator>> seg_iterators; - for (auto& seg_ptr : _segment_cache_handle.value()->segments) { + for (auto& seg_ptr : _segment_cache_handle.get_segments()) { std::unique_ptr<RowwiseIterator> iter; auto s = seg_ptr->new_iterator(schema, read_options, _parent_tracker, &iter); if (!s.ok()) { diff --git a/be/src/olap/segment_loader.cpp b/be/src/olap/segment_loader.cpp index 3a50dc9..872279f 100644 --- a/be/src/olap/segment_loader.cpp +++ b/be/src/olap/segment_loader.cpp @@ -56,23 +56,27 @@ void SegmentLoader::_insert(const SegmentLoader::CacheKey& key, SegmentLoader::C } OLAPStatus SegmentLoader::load_segments(const BetaRowsetSharedPtr& rowset, - SegmentCacheHandle* cache_handle) { - SegmentCacheHandle handle; + SegmentCacheHandle* cache_handle, + bool use_cache) { SegmentLoader::CacheKey cache_key(rowset->rowset_id()); - if (_lookup(cache_key, &handle)) { - *cache_handle = std::move(handle); + if (_lookup(cache_key, cache_handle)) { + cache_handle->owned = false; return OLAP_SUCCESS; } + cache_handle->owned = !use_cache; std::vector<segment_v2::SegmentSharedPtr> segments; RETURN_NOT_OK(rowset->load_segments(&segments)); - // memory of SegmentLoader::CacheValue will be handled by SegmentLoader - SegmentLoader::CacheValue* cache_value = new SegmentLoader::CacheValue(); - cache_value->segments = std::move(segments); - _insert(cache_key, *cache_value, &handle); - *cache_handle = std::move(handle); - + if (use_cache) { + // memory of SegmentLoader::CacheValue will be handled by SegmentLoader + SegmentLoader::CacheValue* cache_value = new SegmentLoader::CacheValue(); + cache_value->segments = std::move(segments); + _insert(cache_key, *cache_value, cache_handle); + } else { + cache_handle->segments = std::move(segments); + } + return OLAP_SUCCESS; } diff --git a/be/src/olap/segment_loader.h b/be/src/olap/segment_loader.h index 8a2bba4..2a75efa 100644 --- a/be/src/olap/segment_loader.h +++ b/be/src/olap/segment_loader.h @@ -83,8 +83,9 @@ public: SegmentLoader(size_t capacity); - // Load segments of "rowset" from _cache, return the "cache_handle" which contains segments - OLAPStatus load_segments(const BetaRowsetSharedPtr& rowset, SegmentCacheHandle* cache_handle); + // Load segments of "rowset", return the "cache_handle" which contains segments. + // If use_cache is true, it will be loaded from _cache. + OLAPStatus load_segments(const BetaRowsetSharedPtr& rowset, SegmentCacheHandle* cache_handle, bool use_cache = false); // Try to prune the segment cache if expired. OLAPStatus prune(); @@ -121,9 +122,12 @@ public: ~SegmentCacheHandle() { if (_handle != nullptr) { + CHECK(_cache != nullptr); + CHECK(segments.empty()) << segments.size(); + CHECK(!owned); // last_visit_time is set when release. // because it only be needed when pruning. - value()->last_visit_time = UnixMillis(); + ((SegmentLoader::CacheValue*) _cache->value(_handle))->last_visit_time = UnixMillis(); _cache->release(_handle); } } @@ -131,15 +135,30 @@ public: SegmentCacheHandle(SegmentCacheHandle&& other) noexcept { std::swap(_cache, other._cache); std::swap(_handle, other._handle); + this->owned = other.owned; + this->segments = std::move(other.segments); } SegmentCacheHandle& operator=(SegmentCacheHandle&& other) noexcept { std::swap(_cache, other._cache); std::swap(_handle, other._handle); + this->owned = other.owned; + this->segments = std::move(other.segments); return *this; } - SegmentLoader::CacheValue* value() { return (SegmentLoader::CacheValue*) _cache->value(_handle); } + std::vector<segment_v2::SegmentSharedPtr>& get_segments() { + if (owned) { + return segments; + } else { + return ((SegmentLoader::CacheValue*) _cache->value(_handle))->segments; + } + } + +public: + // If set to true, the loaded segments will be saved in segments, not in lru cache; + bool owned = false; + std::vector<segment_v2::SegmentSharedPtr> segments; private: Cache* _cache = nullptr; diff --git a/docs/.vuepress/sidebar/en.js b/docs/.vuepress/sidebar/en.js index d5adc11..cbab2d2 100644 --- a/docs/.vuepress/sidebar/en.js +++ b/docs/.vuepress/sidebar/en.js @@ -214,9 +214,10 @@ module.exports = [ sidebarDepth: 1, }, { - title: "Bast Practices", + title: "Benchmark & Bast Practices", directoryPath: "best-practices/", children: [ + "star-schema-benchmark", "fe-load-balance", "systemd", "samples", diff --git a/docs/.vuepress/sidebar/zh-CN.js b/docs/.vuepress/sidebar/zh-CN.js index e1ba2b2..4b75292 100644 --- a/docs/.vuepress/sidebar/zh-CN.js +++ b/docs/.vuepress/sidebar/zh-CN.js @@ -215,9 +215,10 @@ module.exports = [ sidebarDepth: 1, }, { - title: "最佳实践", + title: "性能测试和最佳实践", directoryPath: "best-practices/", children: [ + "star-schema-benchmark", "fe-load-balance", "systemd", "samples", diff --git a/docs/en/administrator-guide/config/be_config.md b/docs/en/administrator-guide/config/be_config.md index 4672101..cfbe04b 100644 --- a/docs/en/administrator-guide/config/be_config.md +++ b/docs/en/administrator-guide/config/be_config.md @@ -1433,7 +1433,7 @@ The size of the buffer before flashing * Type: int32 * Description: Used to limit the number of segments in the newly generated rowset when importing. If the threshold is exceeded, the import will fail with error -238. Too many segments will cause compaction to take up a lot of memory and cause OOM errors. -* Default value: 100 +* Default value: 200 ### `remote_storage_read_buffer_mb` diff --git a/docs/en/best-practices/star-schema-benchmark.md b/docs/en/best-practices/star-schema-benchmark.md new file mode 100644 index 0000000..f0ea9e1 --- /dev/null +++ b/docs/en/best-practices/star-schema-benchmark.md @@ -0,0 +1,163 @@ +--- +{ + "title": "Star-Schema-Benchmark", + "language": "en" +} +--- + +<!-- +Licensed to the Apache Software Foundation (ASF) under one +or more contributor license agreements. See the NOTICE file +distributed with this work for additional information +regarding copyright ownership. The ASF licenses this file +to you under the Apache License, Version 2.0 (the +"License"); you may not use this file except in compliance +with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, +software distributed under the License is distributed on an +"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +KIND, either express or implied. See the License for the +specific language governing permissions and limitations +under the License. +--> + +# Star Schema Benchmark + +[Star Schema Benchmark(SSB)](https://www.cs.umb.edu/~poneil/StarSchemaB.PDF) is a lightweight data warehouse scenario performance test set. Based on [TPC-H](http://www.tpc.org/tpch/), SSB provides a simplified version of the star model data set, which is mainly used to test the performance of multi-table association queries under the star model. + +This document mainly introduces how to pass the preliminary performance test of the SSB process in Doris. + +> Note 1: The standard test set including SSB is usually far from the actual business scenario, and some tests will perform parameter tuning for the test set. Therefore, the test results of the standard test set can only reflect the performance of the database in a specific scenario. It is recommended that users use actual business data for further testing. +> +> Note 2: The operations involved in this document are all performed in the CentOS 7 environment. + +## Environmental preparation + +Please refer to the [official document](http://doris.incubator.apache.org/master/en/installing/install-deploy.html) to install and deploy Doris to obtain a normal running Doris cluster ( Contain at least 1 FE, 1 BE). + +The scripts involved in the following documents are all stored under `tools/ssb-tools/` in the Doris code base. + +## data preparation + +### 1. Download and install the SSB data generation tool. + +Execute the following script to download and compile the [ssb-dbgen](https://github.com/electrum/ssb-dbgen.git) tool. + +``` +sh build-ssb-dbgen.sh +``` + +After the installation is successful, the `dbgen` binary file will be generated in the `ssb-dbgen/` directory. + +### 2. Generate SSB test set + +Execute the following script to generate the SSB data set: + +``` +sh gen-ssb-data.sh -s 100 -c 100 +``` + +> Note 1: `sh gen-ssb-data.sh -h View help` +> +> Note 2: The data will be generated under the directory `ssb-data/` with a suffix of `.tbl`. The total file size is about 60GB. The generation time may vary from a few minutes to an hour. +> +> Note 3: `-s 100` means that the test set size factor is 100, `-c 100` means that 100 threads concurrently generate data in the lineorder table. The `-c` parameter also determines the number of files in the final lineorder table. The larger the parameter, the more files and the smaller each file. + +Under the `-s 100` parameter, the generated data set size is: + +|Table |Rows |Size | File Number | +|---|---|---|---| +|lineorder| 600 million (600037902) | 60GB | 100| +|customer|30 million (3000000) |277M |1| +|part|1.4 million (1400000) | 116M|1| +|supplier|200,000 (200,000) |17M |1| +|date| 2556|228K |1| + +3. Build a table + + Copy the table creation statement in [create-tables.sql](https://github.com/apache/incubator-doris/tree/master/tools/ssb-tools/create-tables.sql) and execute it in Doris. + +4. Import data + + 1. Import 4 dimension table data (customer, part, supplier and date) + + Because the data volume of these 4 dimension tables is small, and the import is simpler, we use the following command to import the data of these 4 tables first: + + `sh load-dimension-data.sh` + + 2. Import the fact table lineorder. + + Import the lineorder table data with the following command: + + `sh load-fact-data.sh -c 3` + + `-c 3` means to start 5 concurrent threads to import (the default is 3). In the case of a single BE node, the import time of lineorder data generated by `sh gen-ssb-data.sh -s 100 -c 100` using `sh load-fact-data.sh -c 3` is about 10 minutes. The memory overhead is about 5-6GB. If you turn on more threads, you can speed up the import speed, but it will increase additional memory overhead. + +5. Check the imported data + + ``` + select count(*) from part; + select count(*) from customer; + select count(*) from supplier; + select count(*) from date; + select count(*) from lineorder; + ``` + + The amount of data should be the same as the number of rows of generated data. + +## Query test + +There are 4 groups of 14 SQL in the SSB test set. The query statement is in the [queries/](https://github.com/apache/incubator-doris/tree/master/tools/ssb-tools/queries) directory. + +## testing report + +The following test report is based on Doris [branch-0.15](https://github.com/apache/incubator-doris/tree/branch-0.15) branch code test, for reference only. (Update time: October 25, 2021) + +1. Hardware environment + + * 1 FE + 1-3 BE mixed + * CPU: 96core, Intel(R) Xeon(R) Gold 6271C CPU @ 2.60GHz + * Memory: 384GB + * Hard disk: 1 HDD + * Network card: 10 Gigabit network card + +2. Data set + + |Table |Rows |Origin Size | Compacted Size(1 Replica) | + |---|---|---|---| + |lineorder| 600 million (600037902) | 60 GB | 14.846 GB | + |customer|30 million (3000000) |277 MB | 414.741 MB | + |part|1.4 million (1.400000) | 116 MB | 38.277 MB | + |supplier|200,000 (200,000) |17 MB | 27.428 MB | + |date| 2556|228 KB | 275.804 KB | + +3. Test results + + |Query |Time(ms) (1 BE) | Time(ms) (3 BE) | Parallelism | Runtime Filter Mode | + |---|---|---|---|---| + | q1.1 | 200 | 140 | 8 | IN | + | q1.2 | 90 | 80 | 8 | IN | + | q1.3 | 90 | 80 | 8 | IN | + | q2.1 | 1100 | 400 | 8 | BLOOM_FILTER | + | q2.2 | 900 | 330 | 8 | BLOOM_FILTER | + | q2.3 | 790 | 320 | 8 | BLOOM_FILTER | + | q3.1 | 3100 | 1280 | 8 | BLOOM_FILTER | + | q3.2 | 700 | 270 | 8 | BLOOM_FILTER | + | q3.3 | 540 | 270 | 8 | BLOOM_FILTER | + | q3.4 | 560 | 240 | 8 | BLOOM_FILTER | + | q4.1 | 2820 | 1150 | 8 | BLOOM_FILTER | + | q4.2 | 1430 | 670 | 8 | BLOOM_FILTER | + | q4.2 | 1750 | 1030 | 8 | BLOOM_FILTER | + + > Note 1: "This test set is far from your generation environment, don't be superstitious!" + > + > Note 2: The test result is the average value of multiple executions (Page Cache will play a certain acceleration role). And the data has undergone sufficient compaction (if you test immediately after importing the data, the query delay may be higher than the test result) + > + > Note 3: Due to environmental constraints, the hardware specifications used in this test are relatively high, but so many hardware resources will not be consumed during the entire test. The memory consumption is within 10GB, and the CPU usage is within 10%. + > + > Note 4: Parallelism means query concurrency, which is set by `set parallel_fragment_exec_instance_num=8`. + > + > Note 5: Runtime Filter Mode is the type of Runtime Filter, set by `set runtime_filter_type="BLOOM_FILTER"`. ([Runtime Filter](http://doris.incubator.apache.org/master/en/administrator-guide/runtime-filter.html) function has a significant effect on the SSB test set. Because in this test level, Join is calculated The data in the sub-right table can filter the left table very well. You can try to turn off this function through `set runtime_filter_mode=off` to see the change in query l [...] diff --git a/docs/zh-CN/administrator-guide/config/be_config.md b/docs/zh-CN/administrator-guide/config/be_config.md index fc4d893..5907a3e 100644 --- a/docs/zh-CN/administrator-guide/config/be_config.md +++ b/docs/zh-CN/administrator-guide/config/be_config.md @@ -1454,7 +1454,7 @@ webserver默认工作线程数 * 类型: int32 * 描述: 用于限制导入时,新产生的rowset中的segment数量。如果超过阈值,导入会失败并报错 -238。过多的 segment 会导致compaction占用大量内存引发 OOM 错误。 -* 默认值: 100 +* 默认值: 200 ### `remote_storage_read_buffer_mb` diff --git a/docs/zh-CN/best-practices/star-schema-benchmark.md b/docs/zh-CN/best-practices/star-schema-benchmark.md new file mode 100644 index 0000000..30f43aa --- /dev/null +++ b/docs/zh-CN/best-practices/star-schema-benchmark.md @@ -0,0 +1,164 @@ +--- +{ + "title": "Star-Schema-Benchmark 测试", + "language": "zh-CN" +} +--- + +<!-- +Licensed to the Apache Software Foundation (ASF) under one +or more contributor license agreements. See the NOTICE file +distributed with this work for additional information +regarding copyright ownership. The ASF licenses this file +to you under the Apache License, Version 2.0 (the +"License"); you may not use this file except in compliance +with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, +software distributed under the License is distributed on an +"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +KIND, either express or implied. See the License for the +specific language governing permissions and limitations +under the License. +--> + +# Star Schema Benchmark + +[Star Schema Benchmark(SSB)](https://www.cs.umb.edu/~poneil/StarSchemaB.PDF) 是一个轻量级的数仓场景下的性能测试集。SSB基于 [TPC-H](http://www.tpc.org/tpch/) 提供了一个简化版的星形模型数据集,主要用于测试在星形模型下,多表关联查询的性能表现。 + +本文档主要介绍如何在 Doris 中通过 SSB 进程初步的性能测试。 + +> 注1:包括 SSB 在内标准测试集通常和实际业务场景差距较大,并且部分测试会针对测试集进行参数调优。所以标准测试集的测试结果仅能反映数据库在特定场景下的性能表现。建议用户使用实际业务数据进行进一步的测试。 +> +> 注2:本文档涉及的操作都在 CentOS 7 环境进行。 + +## 环境准备 + +请先参照 [官方文档](http://doris.incubator.apache.org/master/zh-CN/installing/install-deploy.html) 进行 Doris 的安装部署,以获得一个正常运行中的 Doris 集群(至少包含 1 FE,1 BE)。 + +以下文档中涉及的的脚本都存放在 Doris 代码库的 `tools/ssb-tools/` 下。 + +## 数据准备 + +### 1. 下载安装 SSB 数据生成工具。 + +执行以下脚本下载并编译 [ssb-dbgen](https://github.com/electrum/ssb-dbgen.git) 工具。 + +``` +sh build-ssb-dbgen.sh +``` + +安装成功后,将在 `ssb-dbgen/` 目录下生成 `dbgen` 二进制文件。 + +### 2. 生成 SSB 测试集 + +执行以下脚本生成 SSB 数据集: + +``` +sh gen-ssb-data.sh -s 100 -c 100 +``` + +> 注1:`sh gen-ssb-data.sh -h 查看帮助` +> +> 注2:数据会以 `.tbl` 为后缀生成在 `ssb-data/` 目录下。文件总大小约60GB。生成时间可能在数分钟到1小时不等。 +> +> 注3:`-s 100` 表示测试集大小系数为 100,`-c 100` 表示并发100个线程生成 lineorder 表的数据。`-c` 参数也决定了最终 lineorder 表的文件数量。参数越大,文件数越多,每个文件越小。 + +在 `-s 100` 参数下,生成的数据集大小为: + +|Table |Rows |Size | File Number | +|---|---|---|---| +|lineorder| 6亿(600037902) | 60GB | 100| +|customer|3千万(3000000) |277M |1| +|part|140万(1400000) | 116M|1| +|supplier|20万(200000) |17M |1| +|date| 2556|228K |1| + +3. 建表 + + 复制 [create-tables.sql](https://github.com/apache/incubator-doris/tree/master/tools/ssb-tools/create-tables.sql) 中的建表语句,在 Doris 中执行。 + +4. 导入数据 + + 1. 导入 4 张维度表数据(customer, part, supplier and date) + + 因为这4张维表数据量较小,导入较简答,我们使用以下命令先导入这4表的数据: + + `sh load-dimension-data.sh` + + 2. 导入事实表 lineorder。 + + 通过以下命令导入 lineorder 表数据: + + `sh load-fact-data.sh -c 3` + + `-c 3` 表示启动 5 个并发线程导入(默认为3)。在单 BE 节点情况下,由 `sh gen-ssb-data.sh -s 100 -c 100` 生成的 lineorder 数据,使用 `sh load-fact-data.sh -c 3` 的导入时间约为 10min。内存开销约为 5-6GB。如果开启更多线程,可以加快导入速度,但会增加额外的内存开销。 + +5. 检查导入数据 + + ``` + select count(*) from part; + select count(*) from customer; + select count(*) from supplier; + select count(*) from date; + select count(*) from lineorder; + ``` + + 数据量应和生成数据的行数一致。 + +## 查询测试 + +SSB 测试集共 4 组 14 个 SQL。查询语句在 [queries/](https://github.com/apache/incubator-doris/tree/master/tools/ssb-tools/queries) 目录下。 + +## 测试报告 + +以下测试报告基于 Doris [branch-0.15](https://github.com/apache/incubator-doris/tree/branch-0.15) 分支代码测试,仅供参考。(更新时间:2021年10月25号) + +1. 硬件环境 + + * 1 FE + 1-3 BE 混部 + * CPU:96core, Intel(R) Xeon(R) Gold 6271C CPU @ 2.60GHz + * 内存:384GB + * 硬盘:1块机械硬盘 + * 网卡:万兆网卡 + +2. 数据集 + + |Table |Rows |Origin Size | Compacted Size(1 Replica) | + |---|---|---|---| + |lineorder| 6亿(600037902) | 60 GB | 14.846 GB | + |customer|3千万(3000000) |277 MB | 414.741 MB | + |part|140万(1400000) | 116 MB | 38.277 MB | + |supplier|20万(200000) |17 MB | 27.428 MB | + |date| 2556|228 KB | 275.804 KB | + +3. 测试结果 + + |Query |Time(ms) (1 BE) | Time(ms) (3 BE) | Parallelism | Runtime Filter Mode | + |---|---|---|---|---| + | q1.1 | 200 | 140 | 8 | IN | + | q1.2 | 90 | 80 | 8 | IN | + | q1.3 | 90 | 80 | 8 | IN | + | q2.1 | 1100 | 400 | 8 | BLOOM_FILTER | + | q2.2 | 900 | 330 | 8 | BLOOM_FILTER | + | q2.3 | 790 | 320 | 8 | BLOOM_FILTER | + | q3.1 | 3100 | 1280 | 8 | BLOOM_FILTER | + | q3.2 | 700 | 270 | 8 | BLOOM_FILTER | + | q3.3 | 540 | 270 | 8 | BLOOM_FILTER | + | q3.4 | 560 | 240 | 8 | BLOOM_FILTER | + | q4.1 | 2820 | 1150 | 8 | BLOOM_FILTER | + | q4.2 | 1430 | 670 | 8 | BLOOM_FILTER | + | q4.2 | 1750 | 1030 | 8 | BLOOM_FILTER | + + > 注1:“这个测试集和你的生成环境相去甚远,不要迷信他!” + > + > 注2:测试结果为多次执行取平均值(Page Cache 会起到一定加速作用)。并且数据经过充分的 compaction (如果在刚导入数据后立刻测试,则查询延迟可能高于本测试结果) + > + > 注3:因环境受限,本测试使用的硬件规格较高,但整个测试过程中不会消耗如此多的硬件资源。其中内存消耗在 10GB 以内,CPU使用率在 10% 以内。 + > + > 注4:Parallelism 表示查询并发度,通过 `set parallel_fragment_exec_instance_num=8` 设置。 + > + > 注5:Runtime Filter Mode 是 Runtime Filter 的类型,通过 `set runtime_filter_type="BLOOM_FILTER"` 设置。([Runtime Filter](http://doris.incubator.apache.org/master/zh-CN/administrator-guide/runtime-filter.html) 功能对 SSB 测试集效果显著。因为该测试级中,Join 算子右表的数据可以对左表起到很好的过滤作用。你可以尝试通过 `set runtime_filter_mode=off` 关闭该功能,看看查询延迟的变化。) + diff --git a/fe/fe-core/src/main/java/org/apache/doris/common/Config.java b/fe/fe-core/src/main/java/org/apache/doris/common/Config.java index d79363a..cd02ec1 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/common/Config.java +++ b/fe/fe-core/src/main/java/org/apache/doris/common/Config.java @@ -1376,7 +1376,7 @@ public class Config extends ConfigBase { * Whether to add a delete sign column when create unique table */ @ConfField(mutable = true, masterOnly = true) - public static boolean enable_batch_delete_by_default = false; + public static boolean enable_batch_delete_by_default = true; /** * Used to set default db data quota bytes. diff --git a/tools/ssb-tools/build-ssb-dbgen.sh b/tools/ssb-tools/build-ssb-dbgen.sh new file mode 100755 index 0000000..59af467 --- /dev/null +++ b/tools/ssb-tools/build-ssb-dbgen.sh @@ -0,0 +1,53 @@ +#!/usr/bin/env bash +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +############################################################## +# This script is used to build ssb-dbgen +# sssb-dbgen's source code is from https://github.com/electrum/ssb-dbgen.git +# Usage: +# sh build-ssb-dbgen.sh +############################################################## + +set -eo pipefail + +ROOT=`dirname "$0"` +ROOT=`cd "$ROOT"; pwd` + +CURDIR=${ROOT} +SSB_DBGEN_DIR=$CURDIR/ssb-dbgen/ + +# download ssb-dbgen first +if [[ -d $SSB_DBGEN_DIR ]]; then + echo "Dir $CURDIR/ssb-dbgen/ already exists. No need to download." + echo "If you want to download ssb-dbgen again, please delete this dir first." +else + curl https://palo-cloud-repo-bd.bd.bcebos.com/baidu-doris-release/ssb-dbgen-linux.tar.gz | tar xz -C $CURDIR/ +fi + +# compile ssb-dbgen +cd $SSB_DBGEN_DIR/ && make +cd - + +# check +if [[ -f $CURDIR/ssb-dbgen/dbgen ]]; then + echo "Build succeed! Run $CURDIR/ssb-dbgen/dbgen -h" + exit 0 +else + echo "Build failed!" + exit 1 +fi diff --git a/tools/ssb-tools/create-tables.sql b/tools/ssb-tools/create-tables.sql new file mode 100644 index 0000000..fc971b1 --- /dev/null +++ b/tools/ssb-tools/create-tables.sql @@ -0,0 +1,117 @@ +-- Licensed to the Apache Software Foundation (ASF) under one +-- or more contributor license agreements. See the NOTICE file +-- distributed with this work for additional information +-- regarding copyright ownership. The ASF licenses this file +-- to you under the Apache License, Version 2.0 (the +-- "License"); you may not use this file except in compliance +-- with the License. You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, +-- software distributed under the License is distributed on an +-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +-- KIND, either express or implied. See the License for the +-- specific language governing permissions and limitations +-- under the License. + +CREATE TABLE `lineorder` ( + `lo_orderkey` bigint(20) NULL COMMENT "", + `lo_linenumber` bigint(20) NULL COMMENT "", + `lo_custkey` int(11) NULL COMMENT "", + `lo_partkey` int(11) NULL COMMENT "", + `lo_suppkey` int(11) NULL COMMENT "", + `lo_orderdate` int(11) NULL COMMENT "", + `lo_orderpriority` varchar(16) NULL COMMENT "", + `lo_shippriority` int(11) NULL COMMENT "", + `lo_quantity` bigint(20) NULL COMMENT "", + `lo_extendedprice` bigint(20) NULL COMMENT "", + `lo_ordtotalprice` bigint(20) NULL COMMENT "", + `lo_discount` bigint(20) NULL COMMENT "", + `lo_revenue` bigint(20) NULL COMMENT "", + `lo_supplycost` bigint(20) NULL COMMENT "", + `lo_tax` bigint(20) NULL COMMENT "", + `lo_commitdate` bigint(20) NULL COMMENT "", + `lo_shipmode` varchar(11) NULL COMMENT "" +) +PARTITION BY RANGE(`lo_orderdate`) +(PARTITION p1992 VALUES [("-2147483648"), ("19930101")), +PARTITION p1993 VALUES [("19930101"), ("19940101")), +PARTITION p1994 VALUES [("19940101"), ("19950101")), +PARTITION p1995 VALUES [("19950101"), ("19960101")), +PARTITION p1996 VALUES [("19960101"), ("19970101")), +PARTITION p1997 VALUES [("19970101"), ("19980101")), +PARTITION p1998 VALUES [("19980101"), ("19990101"))) +DISTRIBUTED BY HASH(`lo_orderkey`) BUCKETS 48 +PROPERTIES ( +"replication_num" = "1" +); + +CREATE TABLE `customer` ( + `c_custkey` int(11) NULL COMMENT "", + `c_name` varchar(26) NULL COMMENT "", + `c_address` varchar(41) NULL COMMENT "", + `c_city` varchar(11) NULL COMMENT "", + `c_nation` varchar(16) NULL COMMENT "", + `c_region` varchar(13) NULL COMMENT "", + `c_phone` varchar(16) NULL COMMENT "", + `c_mktsegment` varchar(11) NULL COMMENT "" +) +DISTRIBUTED BY HASH(`c_custkey`) BUCKETS 10 +PROPERTIES ( +"replication_num" = "1" +); + +CREATE TABLE `date` ( + `d_datekey` int(11) NULL COMMENT "", + `d_date` varchar(20) NULL COMMENT "", + `d_dayofweek` varchar(10) NULL COMMENT "", + `d_month` varchar(11) NULL COMMENT "", + `d_year` int(11) NULL COMMENT "", + `d_yearmonthnum` int(11) NULL COMMENT "", + `d_yearmonth` varchar(9) NULL COMMENT "", + `d_daynuminweek` int(11) NULL COMMENT "", + `d_daynuminmonth` int(11) NULL COMMENT "", + `d_daynuminyear` int(11) NULL COMMENT "", + `d_monthnuminyear` int(11) NULL COMMENT "", + `d_weeknuminyear` int(11) NULL COMMENT "", + `d_sellingseason` varchar(14) NULL COMMENT "", + `d_lastdayinweekfl` int(11) NULL COMMENT "", + `d_lastdayinmonthfl` int(11) NULL COMMENT "", + `d_holidayfl` int(11) NULL COMMENT "", + `d_weekdayfl` int(11) NULL COMMENT "" +) +DISTRIBUTED BY HASH(`d_datekey`) BUCKETS 1 +PROPERTIES ( +"replication_num" = "1" +); + +CREATE TABLE `part` ( + `p_partkey` int(11) NULL COMMENT "", + `p_name` varchar(23) NULL COMMENT "", + `p_mfgr` varchar(7) NULL COMMENT "", + `p_category` varchar(8) NULL COMMENT "", + `p_brand` varchar(10) NULL COMMENT "", + `p_color` varchar(12) NULL COMMENT "", + `p_type` varchar(26) NULL COMMENT "", + `p_size` int(11) NULL COMMENT "", + `p_container` varchar(11) NULL COMMENT "" +) +DISTRIBUTED BY HASH(`p_partkey`) BUCKETS 10 +PROPERTIES ( +"replication_num" = "1" +); + +CREATE TABLE `supplier` ( + `s_suppkey` int(11) NULL COMMENT "", + `s_name` varchar(26) NULL COMMENT "", + `s_address` varchar(26) NULL COMMENT "", + `s_city` varchar(11) NULL COMMENT "", + `s_nation` varchar(16) NULL COMMENT "", + `s_region` varchar(13) NULL COMMENT "", + `s_phone` varchar(16) NULL COMMENT "" +) +DISTRIBUTED BY HASH(`s_suppkey`) BUCKETS 10 +PROPERTIES ( +"replication_num" = "1" +); diff --git a/tools/ssb-tools/doris-cluster.conf b/tools/ssb-tools/doris-cluster.conf new file mode 100644 index 0000000..bef6c7b --- /dev/null +++ b/tools/ssb-tools/doris-cluster.conf @@ -0,0 +1,27 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# Any of FE host +export FE_HOST='127.0.0.1' +# http_port in fe.conf +export FE_HTTP_PORT=8030 +# Doris username +export USER='admin' +# Doris password +export PASSWORD='' +# The database where SSB tables located +export DB='db1' diff --git a/tools/ssb-tools/gen-ssb-data.sh b/tools/ssb-tools/gen-ssb-data.sh new file mode 100755 index 0000000..594f129 --- /dev/null +++ b/tools/ssb-tools/gen-ssb-data.sh @@ -0,0 +1,112 @@ +#!/usr/bin/env bash +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +############################################################## +# This script is used to generate ssb data set +############################################################## + +set -eo pipefail + +ROOT=`dirname "$0"` +ROOT=`cd "$ROOT"; pwd` + +CURDIR=${ROOT} +SSB_DBGEN_DIR=$CURDIR/ssb-dbgen/ +SSB_DATA_DIR=$CURDIR/ssb-data/ + +usage() { + echo " +Usage: $0 <options> + Optional options: + -s scale factor, default is 100 + -c parallelism to generate data of lineorder table, default is 10 + + Eg. + $0 generate data using default value. + $0 -s 10 generate data with scale factor 10. + $0 -s 10 -c 5 generate data with scale factor 10. And using 5 threads to generate data concurrently. + " + exit 1 +} + +OPTS=$(getopt \ + -n $0 \ + -o '' \ + -o 'hs:c:' \ + -- "$@") + +eval set -- "$OPTS" + +SCALE_FACTOR=100 +PARALLEL=10 +HELP=0 + +if [ $# == 0 ] ; then + usage +fi + +while true; do + case "$1" in + -h) HELP=1 ; shift ;; + -s) SCALE_FACTOR=$2 ; shift 2 ;; + -c) PARALLEL=$2 ; shift 2 ;; + --) shift ; break ;; + *) echo "Internal error" ; exit 1 ;; + esac +done + +if [[ ${HELP} -eq 1 ]]; then + usage + exit +fi + +echo "Scale Factor: $SCALE_FACTOR" +echo "Parallelism: $PARALLEL" + +# check if dbgen exists +if [[ ! -f $SSB_DBGEN_DIR/dbgen ]]; then + echo "$SSB_DBGEN_DIR/dbgen does not exist. Run build-ssb-dbgen.sh first to build it first." + exit 1 +fi + +if [[ -d $SSB_DATA_DIR/ ]]; then + echo "$SSB_DATA_DIR exists. Remove it before generating data" + exit 1 +fi + +mkdir $SSB_DATA_DIR/ + +# gen data +cd $SSB_DBGEN_DIR +echo "Begin to generate data for table: customer" +$SSB_DBGEN_DIR/dbgen -f -s $SCALE_FACTOR -T c +echo "Begin to generate data for table: part" +$SSB_DBGEN_DIR/dbgen -f -s $SCALE_FACTOR -T p +echo "Begin to generate data for table: supplier" +$SSB_DBGEN_DIR/dbgen -f -s $SCALE_FACTOR -T s +echo "Begin to generate data for table: date" +$SSB_DBGEN_DIR/dbgen -f -s $SCALE_FACTOR -T d +echo "Begin to generate data for table: lineorder" +$SSB_DBGEN_DIR/dbgen -f -s $SCALE_FACTOR -T l -C $PARALLEL +cd - + +# move data to $SSB_DATA_DIR +mv $SSB_DBGEN_DIR/*.tbl* $SSB_DATA_DIR/ + +# check data +du -sh $SSB_DATA_DIR/*.tbl* diff --git a/tools/ssb-tools/load-dimension-data.sh b/tools/ssb-tools/load-dimension-data.sh new file mode 100755 index 0000000..f24ad3a --- /dev/null +++ b/tools/ssb-tools/load-dimension-data.sh @@ -0,0 +1,68 @@ +#!/usr/bin/env bash +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +############################################################## +# This script is used to load generated ssb data set to Doris +# Only for 4 dimension tables: customer, part, supplier and date. +# Usage: +# sh load-dimension-data.sh +############################################################## + +set -eo pipefail + +ROOT=`dirname "$0"` +ROOT=`cd "$ROOT"; pwd` + +CURDIR=${ROOT} +SSB_DATA_DIR=$CURDIR/ssb-data/ + +# check if ssb-data exists +if [[ ! -d $SSB_DATA_DIR/ ]]; then + echo "$SSB_DATA_DIR does not exist. Run sh gen-ssb-data.sh first." + exit 1 +fi + +check_prerequest() { + local CMD=$1 + local NAME=$2 + if ! $CMD; then + echo "$NAME is missing. This script depends on cURL to load data to Doris." + exit 1 + fi +} + +check_prerequest "curl --version" "curl" + +# load 4 small dimension tables + +source $CURDIR/doris-cluster.conf + +echo "FE_HOST: $FE_HOST" +echo "FE_HTTP_PORT: $FE_HTTP_PORT" +echo "USER: $USER" +echo "PASSWORD: $PASSWORD" +echo "DB: $DB" + +echo 'Loading data for table: part' +curl --location-trusted -u $USER:$PASSWORD -H "column_separator:|" -H "columns:p_partkey,p_name,p_mfgr,p_category,p_brand,p_color,p_type,p_size,p_container,p_dummy" -T $SSB_DATA_DIR/part.tbl http://$FE_HOST:$FE_HTTP_PORT/api/$DB/part/_stream_load +echo 'Loading data for table: date' +curl --location-trusted -u $USER:$PASSWORD -H "column_separator:|" -H "columns:d_datekey,d_date,d_dayofweek,d_month,d_year,d_yearmonthnum,d_yearmonth,d_daynuminweek,d_daynuminmonth,d_daynuminyear,d_monthnuminyear,d_weeknuminyear,d_sellingseason,d_lastdayinweekfl,d_lastdayinmonthfl,d_holidayfl,d_weekdayfl,d_dummy" -T $SSB_DATA_DIR/date.tbl http://$FE_HOST:$FE_HTTP_PORT/api/$DB/date/_stream_load +echo 'Loading data for table: supplier' +curl --location-trusted -u $USER:$PASSWORD -H "column_separator:|" -H "columns:s_suppkey,s_name,s_address,s_city,s_nation,s_region,s_phone,s_dummy" -T $SSB_DATA_DIR/supplier.tbl http://$FE_HOST:$FE_HTTP_PORT/api/$DB/supplier/_stream_load +echo 'Loading data for table: customer' +curl --location-trusted -u $USER:$PASSWORD -H "column_separator:|" -H "columns:c_custkey,c_name,c_address,c_city,c_nation,c_region,c_phone,c_mktsegment,no_use" -T $SSB_DATA_DIR/customer.tbl http://$FE_HOST:$FE_HTTP_PORT/api/$DB/customer/_stream_load diff --git a/tools/ssb-tools/load-fact-data.sh b/tools/ssb-tools/load-fact-data.sh new file mode 100755 index 0000000..24bc7f3 --- /dev/null +++ b/tools/ssb-tools/load-fact-data.sh @@ -0,0 +1,126 @@ +#!/usr/bin/env bash +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +############################################################## +# This script is used to load generated ssb data set to Doris +# Only for 1 fact table: lineorder +############################################################## + +set -eo pipefail + +ROOT=`dirname "$0"` +ROOT=`cd "$ROOT"; pwd` + +CURDIR=${ROOT} +SSB_DATA_DIR=$CURDIR/ssb-data/ + +usage() { + echo " +Usage: $0 <options> + Optional options: + -c parallelism to load data of lineorder table, default is 5. + + Eg. + $0 load data using default value. + $0 -c 10 load lineorder table data using parallelism 10. + " + exit 1 +} + +OPTS=$(getopt \ + -n $0 \ + -o '' \ + -o 'c:' \ + -- "$@") + +eval set -- "$OPTS" + +PARALLEL=3 +HELP=0 + +if [ $# == 0 ] ; then + usage +fi + +while true; do + case "$1" in + -h) HELP=1 ; shift ;; + -c) PARALLEL=$2 ; shift 2 ;; + --) shift ; break ;; + *) echo "Internal error" ; exit 1 ;; + esac +done + +if [[ ${HELP} -eq 1 ]]; then + usage + exit +fi + +echo "Parallelism: $PARALLEL" + +# check if ssb-data exists +if [[ ! -d $SSB_DATA_DIR/ ]]; then + echo "$SSB_DATA_DIR does not exist. Run sh gen-ssb-data.sh first." + exit 1 +fi + +check_prerequest() { + local CMD=$1 + local NAME=$2 + if ! $CMD; then + echo "$NAME is missing. This script depends on cURL to load data to Doris." + exit 1 + fi +} + +check_prerequest "curl --version" "curl" + +# load lineorder +source $CURDIR/doris-cluster.conf + +echo "FE_HOST: $FE_HOST" +echo "FE_HTTP_PORT: $FE_HTTP_PORT" +echo "USER: $USER" +echo "PASSWORD: $PASSWORD" +echo "DB: $DB" + +function load() +{ + echo $@ + curl --location-trusted -u $USER:$PASSWORD -H "column_separator:|" -H "columns:lo_orderkey,lo_linenumber,lo_custkey,lo_partkey,lo_suppkey,lo_orderdate,lo_orderpriority,lo_shippriority,lo_quantity,lo_extendedprice,lo_ordtotalprice,lo_discount,lo_revenue,lo_supplycost,lo_tax,lo_commitdate,lo_shipmode,lo_dummy" -T $@ http://$FE_HOST:$FE_HTTP_PORT/api/$DB/lineorder/_stream_load +} + + +# set parallelism +[ -e /tmp/fd1 ] || mkfifo /tmp/fd1 +exec 3<>/tmp/fd1 +rm -rf /tmp/fd1 + +for ((i=1;i<=$PARALLEL;i++)) +do + echo >&3 +done + +for file in `ls $SSB_DATA_DIR/lineorder.tbl.*` +do + read -u3 + { + load $file + echo >&3 + }& +done diff --git a/tools/ssb-tools/queries/q1.1.sql b/tools/ssb-tools/queries/q1.1.sql new file mode 100644 index 0000000..4ef15e9 --- /dev/null +++ b/tools/ssb-tools/queries/q1.1.sql @@ -0,0 +1,24 @@ +-- Licensed to the Apache Software Foundation (ASF) under one +-- or more contributor license agreements. See the NOTICE file +-- distributed with this work for additional information +-- regarding copyright ownership. The ASF licenses this file +-- to you under the Apache License, Version 2.0 (the +-- "License"); you may not use this file except in compliance +-- with the License. You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, +-- software distributed under the License is distributed on an +-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +-- KIND, either express or implied. See the License for the +-- specific language governing permissions and limitations +-- under the License. + +SELECT SUM(lo_extendedprice*lo_discount) AS +REVENUE +FROM lineorder, date +WHERE lo_orderdate = d_datekey +AND d_year = 1993 +AND lo_discount BETWEEN 1 AND 3 +AND lo_quantity < 25; diff --git a/tools/ssb-tools/queries/q1.2.sql b/tools/ssb-tools/queries/q1.2.sql new file mode 100644 index 0000000..1b8442b --- /dev/null +++ b/tools/ssb-tools/queries/q1.2.sql @@ -0,0 +1,24 @@ +-- Licensed to the Apache Software Foundation (ASF) under one +-- or more contributor license agreements. See the NOTICE file +-- distributed with this work for additional information +-- regarding copyright ownership. The ASF licenses this file +-- to you under the Apache License, Version 2.0 (the +-- "License"); you may not use this file except in compliance +-- with the License. You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, +-- software distributed under the License is distributed on an +-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +-- KIND, either express or implied. See the License for the +-- specific language governing permissions and limitations +-- under the License. + +SELECT SUM(lo_extendedprice*lo_discount) AS +REVENUE +FROM lineorder, date +WHERE lo_orderdate = d_datekey +AND d_yearmonth = 'Jan1994' +AND lo_discount BETWEEN 4 AND 6 +AND lo_quantity BETWEEN 26 AND 35; diff --git a/tools/ssb-tools/queries/q1.3.sql b/tools/ssb-tools/queries/q1.3.sql new file mode 100644 index 0000000..ed6e51b --- /dev/null +++ b/tools/ssb-tools/queries/q1.3.sql @@ -0,0 +1,25 @@ +-- Licensed to the Apache Software Foundation (ASF) under one +-- or more contributor license agreements. See the NOTICE file +-- distributed with this work for additional information +-- regarding copyright ownership. The ASF licenses this file +-- to you under the Apache License, Version 2.0 (the +-- "License"); you may not use this file except in compliance +-- with the License. You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, +-- software distributed under the License is distributed on an +-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +-- KIND, either express or implied. See the License for the +-- specific language governing permissions and limitations +-- under the License. + +SELECT SUM(lo_extendedprice*lo_discount) AS +REVENUE +FROM lineorder, date +WHERE lo_orderdate = d_datekey +AND d_weeknuminyear= 6 +AND d_year = 1994 +AND lo_discount BETWEEN 5 AND 7 +AND lo_quantity BETWEEN 26 AND 35; diff --git a/tools/ssb-tools/queries/q2.1.sql b/tools/ssb-tools/queries/q2.1.sql new file mode 100644 index 0000000..e1a1f52 --- /dev/null +++ b/tools/ssb-tools/queries/q2.1.sql @@ -0,0 +1,26 @@ +-- Licensed to the Apache Software Foundation (ASF) under one +-- or more contributor license agreements. See the NOTICE file +-- distributed with this work for additional information +-- regarding copyright ownership. The ASF licenses this file +-- to you under the Apache License, Version 2.0 (the +-- "License"); you may not use this file except in compliance +-- with the License. You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, +-- software distributed under the License is distributed on an +-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +-- KIND, either express or implied. See the License for the +-- specific language governing permissions and limitations +-- under the License. + +SELECT SUM(lo_revenue), d_year, p_brand +FROM lineorder, date, part, supplier +WHERE lo_orderdate = d_datekey +AND lo_partkey = p_partkey +AND lo_suppkey = s_suppkey +AND p_category = 'MFGR#12' +AND s_region = 'AMERICA' +GROUP BY d_year, p_brand +ORDER BY d_year, p_brand; diff --git a/tools/ssb-tools/queries/q2.2.sql b/tools/ssb-tools/queries/q2.2.sql new file mode 100644 index 0000000..3db6170 --- /dev/null +++ b/tools/ssb-tools/queries/q2.2.sql @@ -0,0 +1,27 @@ +-- Licensed to the Apache Software Foundation (ASF) under one +-- or more contributor license agreements. See the NOTICE file +-- distributed with this work for additional information +-- regarding copyright ownership. The ASF licenses this file +-- to you under the Apache License, Version 2.0 (the +-- "License"); you may not use this file except in compliance +-- with the License. You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, +-- software distributed under the License is distributed on an +-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +-- KIND, either express or implied. See the License for the +-- specific language governing permissions and limitations +-- under the License. + +SELECT SUM(lo_revenue), d_year, p_brand +FROM lineorder, date, part, supplier +WHERE lo_orderdate = d_datekey +AND lo_partkey = p_partkey +AND lo_suppkey = s_suppkey +AND p_brand BETWEEN 'MFGR#2221' +AND 'MFGR#2228' +AND s_region = 'ASIA' +GROUP BY d_year, p_brand +ORDER BY d_year, p_brand; diff --git a/tools/ssb-tools/queries/q2.3.sql b/tools/ssb-tools/queries/q2.3.sql new file mode 100644 index 0000000..b70ca90 --- /dev/null +++ b/tools/ssb-tools/queries/q2.3.sql @@ -0,0 +1,26 @@ +-- Licensed to the Apache Software Foundation (ASF) under one +-- or more contributor license agreements. See the NOTICE file +-- distributed with this work for additional information +-- regarding copyright ownership. The ASF licenses this file +-- to you under the Apache License, Version 2.0 (the +-- "License"); you may not use this file except in compliance +-- with the License. You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, +-- software distributed under the License is distributed on an +-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +-- KIND, either express or implied. See the License for the +-- specific language governing permissions and limitations +-- under the License. + +SELECT SUM(lo_revenue), d_year, p_brand +FROM lineorder, date, part, supplier +WHERE lo_orderdate = d_datekey +AND lo_partkey = p_partkey +AND lo_suppkey = s_suppkey +AND p_brand = 'MFGR#2239' +AND s_region = 'EUROPE' +GROUP BY d_year, p_brand +ORDER BY d_year, p_brand; diff --git a/tools/ssb-tools/queries/q3.1.sql b/tools/ssb-tools/queries/q3.1.sql new file mode 100644 index 0000000..70f17d7 --- /dev/null +++ b/tools/ssb-tools/queries/q3.1.sql @@ -0,0 +1,28 @@ +-- Licensed to the Apache Software Foundation (ASF) under one +-- or more contributor license agreements. See the NOTICE file +-- distributed with this work for additional information +-- regarding copyright ownership. The ASF licenses this file +-- to you under the Apache License, Version 2.0 (the +-- "License"); you may not use this file except in compliance +-- with the License. You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, +-- software distributed under the License is distributed on an +-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +-- KIND, either express or implied. See the License for the +-- specific language governing permissions and limitations +-- under the License. + +SELECT c_nation, s_nation, d_year, +SUM(lo_revenue) AS REVENUE +FROM customer, lineorder, supplier, date +WHERE lo_custkey = c_custkey +AND lo_suppkey = s_suppkey +AND lo_orderdate = d_datekey +AND c_region = 'ASIA' +AND s_region = 'ASIA' +AND d_year >= 1992 AND d_year <= 1997 +GROUP BY c_nation, s_nation, d_year +ORDER BY d_year ASC, REVENUE DESC; diff --git a/tools/ssb-tools/queries/q3.2.sql b/tools/ssb-tools/queries/q3.2.sql new file mode 100644 index 0000000..a416fbe --- /dev/null +++ b/tools/ssb-tools/queries/q3.2.sql @@ -0,0 +1,28 @@ +-- Licensed to the Apache Software Foundation (ASF) under one +-- or more contributor license agreements. See the NOTICE file +-- distributed with this work for additional information +-- regarding copyright ownership. The ASF licenses this file +-- to you under the Apache License, Version 2.0 (the +-- "License"); you may not use this file except in compliance +-- with the License. You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, +-- software distributed under the License is distributed on an +-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +-- KIND, either express or implied. See the License for the +-- specific language governing permissions and limitations +-- under the License. + +SELECT c_city, s_city, d_year, sum(lo_revenue) +AS REVENUE +FROM customer, lineorder, supplier, date +WHERE lo_custkey = c_custkey +AND lo_suppkey = s_suppkey +AND lo_orderdate = d_datekey +AND c_nation = 'UNITED STATES' +AND s_nation = 'UNITED STATES' +AND d_year >= 1992 AND d_year <= 1997 +GROUP BY c_city, s_city, d_year +ORDER BY d_year ASC, REVENUE DESC; diff --git a/tools/ssb-tools/queries/q3.3.sql b/tools/ssb-tools/queries/q3.3.sql new file mode 100644 index 0000000..98e29b7 --- /dev/null +++ b/tools/ssb-tools/queries/q3.3.sql @@ -0,0 +1,30 @@ +-- Licensed to the Apache Software Foundation (ASF) under one +-- or more contributor license agreements. See the NOTICE file +-- distributed with this work for additional information +-- regarding copyright ownership. The ASF licenses this file +-- to you under the Apache License, Version 2.0 (the +-- "License"); you may not use this file except in compliance +-- with the License. You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, +-- software distributed under the License is distributed on an +-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +-- KIND, either express or implied. See the License for the +-- specific language governing permissions and limitations +-- under the License. + +SELECT c_city, s_city, d_year, SUM(lo_revenue) +AS REVENUE +FROM customer, lineorder, supplier, date +WHERE lo_custkey = c_custkey +AND lo_suppkey = s_suppkey +AND lo_orderdate = d_datekey +AND (c_city='UNITED KI1' +OR c_city='UNITED KI5') +AND (s_city='UNITED KI1' +OR s_city='UNITED KI5') +AND d_year >= 1992 AND d_year <= 1997 +GROUP BY c_city, s_city, d_year +ORDER BY d_year ASC, REVENUE DESC; diff --git a/tools/ssb-tools/queries/q3.4.sql b/tools/ssb-tools/queries/q3.4.sql new file mode 100644 index 0000000..65fe992 --- /dev/null +++ b/tools/ssb-tools/queries/q3.4.sql @@ -0,0 +1,30 @@ +-- Licensed to the Apache Software Foundation (ASF) under one +-- or more contributor license agreements. See the NOTICE file +-- distributed with this work for additional information +-- regarding copyright ownership. The ASF licenses this file +-- to you under the Apache License, Version 2.0 (the +-- "License"); you may not use this file except in compliance +-- with the License. You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, +-- software distributed under the License is distributed on an +-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +-- KIND, either express or implied. See the License for the +-- specific language governing permissions and limitations +-- under the License. + +SELECT c_city, s_city, d_year, SUM(lo_revenue) +AS REVENUE +FROM customer, lineorder, supplier, date +WHERE lo_custkey = c_custkey +AND lo_suppkey = s_suppkey +AND lo_orderdate = d_datekey +AND (c_city='UNITED KI1' +OR c_city='UNITED KI5') +AND (s_city='UNITED KI1' +OR s_city='UNITED KI5') +AND d_yearmonth = 'Dec1997' +GROUP BY c_city, s_city, d_year +ORDER BY d_year ASC, REVENUE DESC; diff --git a/tools/ssb-tools/queries/q4.1.sql b/tools/ssb-tools/queries/q4.1.sql new file mode 100644 index 0000000..bdcd730 --- /dev/null +++ b/tools/ssb-tools/queries/q4.1.sql @@ -0,0 +1,30 @@ +-- Licensed to the Apache Software Foundation (ASF) under one +-- or more contributor license agreements. See the NOTICE file +-- distributed with this work for additional information +-- regarding copyright ownership. The ASF licenses this file +-- to you under the Apache License, Version 2.0 (the +-- "License"); you may not use this file except in compliance +-- with the License. You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, +-- software distributed under the License is distributed on an +-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +-- KIND, either express or implied. See the License for the +-- specific language governing permissions and limitations +-- under the License. + +SELECT d_year, c_nation, +SUM(lo_revenue - lo_supplycost) AS PROFIT +FROM date, customer, supplier, part, lineorder +WHERE lo_custkey = c_custkey +AND lo_suppkey = s_suppkey +AND lo_partkey = p_partkey +AND lo_orderdate = d_datekey +AND c_region = 'AMERICA' +AND s_region = 'AMERICA' +AND (p_mfgr = 'MFGR#1' +OR p_mfgr = 'MFGR#2') +GROUP BY d_year, c_nation +ORDER BY d_year, c_nation; diff --git a/tools/ssb-tools/queries/q4.2.sql b/tools/ssb-tools/queries/q4.2.sql new file mode 100644 index 0000000..24c82cf --- /dev/null +++ b/tools/ssb-tools/queries/q4.2.sql @@ -0,0 +1,31 @@ +-- Licensed to the Apache Software Foundation (ASF) under one +-- or more contributor license agreements. See the NOTICE file +-- distributed with this work for additional information +-- regarding copyright ownership. The ASF licenses this file +-- to you under the Apache License, Version 2.0 (the +-- "License"); you may not use this file except in compliance +-- with the License. You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, +-- software distributed under the License is distributed on an +-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +-- KIND, either express or implied. See the License for the +-- specific language governing permissions and limitations +-- under the License. + +SELECT d_year, s_nation, p_category, +SUM(lo_revenue - lo_supplycost) AS PROFIT +FROM date, customer, supplier, part, lineorder +WHERE lo_custkey = c_custkey +AND lo_suppkey = s_suppkey +AND lo_partkey = p_partkey +AND lo_orderdate = d_datekey +AND c_region = 'AMERICA' +AND s_region = 'AMERICA' +AND (d_year = 1997 OR d_year = 1998) +AND (p_mfgr = 'MFGR#1' +OR p_mfgr = 'MFGR#2') +GROUP BY d_year, s_nation, p_category +ORDER BY d_year, s_nation, p_category; diff --git a/tools/ssb-tools/queries/q4.3.sql b/tools/ssb-tools/queries/q4.3.sql new file mode 100644 index 0000000..0dcc08b --- /dev/null +++ b/tools/ssb-tools/queries/q4.3.sql @@ -0,0 +1,29 @@ +-- Licensed to the Apache Software Foundation (ASF) under one +-- or more contributor license agreements. See the NOTICE file +-- distributed with this work for additional information +-- regarding copyright ownership. The ASF licenses this file +-- to you under the Apache License, Version 2.0 (the +-- "License"); you may not use this file except in compliance +-- with the License. You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, +-- software distributed under the License is distributed on an +-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +-- KIND, either express or implied. See the License for the +-- specific language governing permissions and limitations +-- under the License. + +SELECT d_year, s_city, p_brand, +SUM(lo_revenue - lo_supplycost) AS PROFIT +FROM date, customer, supplier, part, lineorder +WHERE lo_custkey = c_custkey +AND lo_suppkey = s_suppkey +AND lo_partkey = p_partkey +AND lo_orderdate = d_datekey +AND s_nation = 'UNITED STATES' +AND (d_year = 1997 OR d_year = 1998) +AND p_category = 'MFGR#14' +GROUP BY d_year, s_city, p_brand +ORDER BY d_year, s_city, p_brand; --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org