This is an automated email from the ASF dual-hosted git repository. stigahuang pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git
commit 6e7cd605fdf3ce95a5993676a87a0209ae671973 Author: yx91490 <[email protected]> AuthorDate: Wed Aug 17 13:08:06 2022 +0800 IMPALA-11504: Specializing DecimalUtil::GetScaleMultiplier<int256_t>(). Currently decimal-util.h didn't specialize DecimalUtil ::GetScaleMultiplier<int256_t>(), causing more performance loss when calculate Decimal16Value division. This template specialisation results in 5600X speedup approximately. Testing: - Ran existing jobs. - Add decimal-util-benchmark. Change-Id: I969e2977d51313e738f72c8246db003ae43a3782 Reviewed-on: http://gerrit.cloudera.org:8080/18861 Reviewed-by: Impala Public Jenkins <[email protected]> Tested-by: Impala Public Jenkins <[email protected]> --- be/src/benchmarks/CMakeLists.txt | 1 + be/src/benchmarks/decimal-util-benchmark.cc | 92 +++++++++++++++++ be/src/runtime/decimal-test.cc | 19 ++++ be/src/util/decimal-util.h | 153 ++++++++++++++++++++++------ 4 files changed, 235 insertions(+), 30 deletions(-) diff --git a/be/src/benchmarks/CMakeLists.txt b/be/src/benchmarks/CMakeLists.txt index f436cefee..2af5e9fe2 100644 --- a/be/src/benchmarks/CMakeLists.txt +++ b/be/src/benchmarks/CMakeLists.txt @@ -38,6 +38,7 @@ ADD_BE_BENCHMARK(bitmap-benchmark) ADD_BE_BENCHMARK(bit-packing-benchmark) ADD_BE_BENCHMARK(bloom-filter-benchmark) ADD_BE_BENCHMARK(bswap-benchmark) +ADD_BE_BENCHMARK(decimal-util-benchmark) ADD_BE_BENCHMARK(expr-benchmark) ADD_BE_BENCHMARK(free-lists-benchmark) ADD_BE_BENCHMARK(hash-benchmark) diff --git a/be/src/benchmarks/decimal-util-benchmark.cc b/be/src/benchmarks/decimal-util-benchmark.cc new file mode 100644 index 000000000..e1395175a --- /dev/null +++ b/be/src/benchmarks/decimal-util-benchmark.cc @@ -0,0 +1,92 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include <iostream> +#include <vector> + +#include "util/benchmark.h" +#include "util/cpu-info.h" +#include "util/decimal-util.h" + +// Summary result (ran within a Docker container): +// +// Machine Info: AMD Ryzen 9 5950X 16-Core Processor +// ScaleMultiplierBenchmark: Function iters/ms 10%ile 50%ile 90%ile 10%ile 50%ile 90%ile +// (relative) (relative) (relative) +// --------------------------------------------------------------------------------------------------------- +// raw 0.852 0.852 0.852 1X 1X 1X +// specialized 4.71e+03 4.76e+03 4.81e+03 5.53e+03X 5.59e+03X 5.64e+03X + +using namespace impala; + +namespace int256_scale_multiplier { + +void AddTestData(vector<int>* data, int n) { + for (int i = 0; i < n; i++) { + int m = rand() % DecimalUtil::INT256_SCALE_UPPER_BOUND; + data->push_back(m); + } +} + +/// Copied from Decimal::GetScaleMultiplier +static int256_t GetScaleMultiplier(int scale) { + int256_t result = 1; + for (int i = 0; i < scale; ++i) { + result *= 10; + } + return result; +} + +// A volatile variable to hold the return value of the benchmarked functions. +// Used to prevent the compiler from optimising out the calls. +static volatile int64_t volatile_var = 0; + +static void TestRawTemplateCall(int batch_size, void* d) { + vector<int>* data = reinterpret_cast<vector<int>*>(d); + for (int i = 0; i < batch_size; i++) { + for (int j = 0; j < data->size(); j++) { + auto m = GetScaleMultiplier(j); + volatile_var = m.convert_to<int64_t>(); + } + } +} + +static void TestSpecializedTemplateCall(int batch_size, void* d) { + vector<int>* data = reinterpret_cast<vector<int>*>(d); + for (int i = 0; i < batch_size; i++) { + for (int j = 0; j < data->size(); j++) { + auto m = DecimalUtil::GetScaleMultiplier<int256_t>(j); + volatile_var = m.convert_to<int64_t>(); + } + } +} + +} // namespace int256_scale_multiplier + +int main(int argc, char** argv) { + CpuInfo::Init(); + std::cout << Benchmark::GetMachineInfo() << std::endl; + vector<int> data; + int256_scale_multiplier::AddTestData(&data, 1000); + Benchmark int256_scale_multiplier_suite("ScaleMultiplierBenchmark"); + int256_scale_multiplier_suite.AddBenchmark( + "raw", int256_scale_multiplier::TestRawTemplateCall, &data); + int256_scale_multiplier_suite.AddBenchmark( + "specialized", int256_scale_multiplier::TestSpecializedTemplateCall, &data); + std::cout << int256_scale_multiplier_suite.Measure() << std::endl; + return 0; +} diff --git a/be/src/runtime/decimal-test.cc b/be/src/runtime/decimal-test.cc index c5aed53c1..34624b134 100644 --- a/be/src/runtime/decimal-test.cc +++ b/be/src/runtime/decimal-test.cc @@ -1004,5 +1004,24 @@ TEST(DecimalTest, PrecisionScaleValidation) { EXPECT_FALSE(ColumnType::ValidateDecimalParams(15, 16)); } +template <typename T> +static void TestGetScaleMultiplier(int scale_upper_bound, T overflow_val) { + T expect = 1; + for (int scale = 0; scale < scale_upper_bound; scale++) { + EXPECT_EQ(expect, DecimalUtil::GetScaleMultiplier<T>(scale)); + expect *= 10; + } + // test overflow + EXPECT_EQ(overflow_val, DecimalUtil::GetScaleMultiplier<T>(scale_upper_bound)); +} + +TEST(DecimalTest, GetScaleMultiplier) { + TestGetScaleMultiplier<int32_t>(DecimalUtil::INT32_SCALE_UPPER_BOUND, -1); + TestGetScaleMultiplier<int64_t>(DecimalUtil::INT64_SCALE_UPPER_BOUND, -1); + TestGetScaleMultiplier<int128_t>(DecimalUtil::INT128_SCALE_UPPER_BOUND, -1); + TestGetScaleMultiplier<int256_t>(DecimalUtil::INT256_SCALE_UPPER_BOUND, -1); + TestGetScaleMultiplier<double>(DecimalUtil::INT64_SCALE_UPPER_BOUND, 1E19); +} + } diff --git a/be/src/util/decimal-util.h b/be/src/util/decimal-util.h index 6c1d94f19..b9bb8bf1e 100644 --- a/be/src/util/decimal-util.h +++ b/be/src/util/decimal-util.h @@ -34,6 +34,15 @@ namespace impala { class DecimalUtil { public: + // The scale's exclusive upper bound for GetScaleMultiplier<int32_t>() + static constexpr int INT32_SCALE_UPPER_BOUND = ColumnType::MAX_DECIMAL4_PRECISION + 1; + // The scale's exclusive upper bound for GetScaleMultiplier<int64_t>() + static constexpr int INT64_SCALE_UPPER_BOUND = ColumnType::MAX_DECIMAL8_PRECISION + 1; + // The scale's exclusive upper bound for GetScaleMultiplier<int128_t>() + static constexpr int INT128_SCALE_UPPER_BOUND = ColumnType::MAX_PRECISION + 1; + // The scale's exclusive upper bound for GetScaleMultiplier<int256_t>() + static constexpr int INT256_SCALE_UPPER_BOUND = 77; + // Helper function that checks for multiplication overflow. We only check for overflow // if may_overflow is false. template <typename T> @@ -166,7 +175,7 @@ class DecimalUtil { template <> inline int32_t DecimalUtil::GetScaleMultiplier<int32_t>(int scale) { DCHECK_GE(scale, 0); - static const int32_t values[] = { + static constexpr int32_t values[INT32_SCALE_UPPER_BOUND] = { 1, 10, 100, @@ -177,15 +186,14 @@ inline int32_t DecimalUtil::GetScaleMultiplier<int32_t>(int scale) { 10000000, 100000000, 1000000000}; - DCHECK_GE(sizeof(values) / sizeof(int32_t), ColumnType::MAX_DECIMAL4_PRECISION); - if (LIKELY(scale < 10)) return values[scale]; + if (LIKELY(scale < INT32_SCALE_UPPER_BOUND)) return values[scale]; return -1; // Overflow } template <> inline int64_t DecimalUtil::GetScaleMultiplier<int64_t>(int scale) { DCHECK_GE(scale, 0); - static const int64_t values[] = { + static constexpr int64_t values[INT64_SCALE_UPPER_BOUND] = { 1ll, 10ll, 100ll, @@ -205,15 +213,15 @@ inline int64_t DecimalUtil::GetScaleMultiplier<int64_t>(int scale) { 10000000000000000ll, 100000000000000000ll, 1000000000000000000ll}; - DCHECK_GE(sizeof(values) / sizeof(int64_t), ColumnType::MAX_DECIMAL8_PRECISION); - if (LIKELY(scale < 19)) return values[scale]; + if (LIKELY(scale < INT64_SCALE_UPPER_BOUND)) return values[scale]; return -1; // Overflow } template <> inline int128_t DecimalUtil::GetScaleMultiplier<int128_t>(int scale) { DCHECK_GE(scale, 0); - static const int128_t values[] = { + static constexpr int128_t i10e18{1000000000000000000ll}; + static constexpr int128_t values[INT128_SCALE_UPPER_BOUND] = { static_cast<int128_t>(1ll), static_cast<int128_t>(10ll), static_cast<int128_t>(100ll), @@ -232,29 +240,114 @@ inline int128_t DecimalUtil::GetScaleMultiplier<int128_t>(int scale) { static_cast<int128_t>(1000000000000000ll), static_cast<int128_t>(10000000000000000ll), static_cast<int128_t>(100000000000000000ll), - static_cast<int128_t>(1000000000000000000ll), - static_cast<int128_t>(1000000000000000000ll) * 10ll, - static_cast<int128_t>(1000000000000000000ll) * 100ll, - static_cast<int128_t>(1000000000000000000ll) * 1000ll, - static_cast<int128_t>(1000000000000000000ll) * 10000ll, - static_cast<int128_t>(1000000000000000000ll) * 100000ll, - static_cast<int128_t>(1000000000000000000ll) * 1000000ll, - static_cast<int128_t>(1000000000000000000ll) * 10000000ll, - static_cast<int128_t>(1000000000000000000ll) * 100000000ll, - static_cast<int128_t>(1000000000000000000ll) * 1000000000ll, - static_cast<int128_t>(1000000000000000000ll) * 10000000000ll, - static_cast<int128_t>(1000000000000000000ll) * 100000000000ll, - static_cast<int128_t>(1000000000000000000ll) * 1000000000000ll, - static_cast<int128_t>(1000000000000000000ll) * 10000000000000ll, - static_cast<int128_t>(1000000000000000000ll) * 100000000000000ll, - static_cast<int128_t>(1000000000000000000ll) * 1000000000000000ll, - static_cast<int128_t>(1000000000000000000ll) * 10000000000000000ll, - static_cast<int128_t>(1000000000000000000ll) * 100000000000000000ll, - static_cast<int128_t>(1000000000000000000ll) * 100000000000000000ll * 10ll, - static_cast<int128_t>(1000000000000000000ll) * 100000000000000000ll * 100ll, - static_cast<int128_t>(1000000000000000000ll) * 100000000000000000ll * 1000ll}; - DCHECK_GE(sizeof(values) / sizeof(int128_t), ColumnType::MAX_PRECISION); - if (LIKELY(scale < 39)) return values[scale]; + i10e18, + i10e18 * 10ll, + i10e18 * 100ll, + i10e18 * 1000ll, + i10e18 * 10000ll, + i10e18 * 100000ll, + i10e18 * 1000000ll, + i10e18 * 10000000ll, + i10e18 * 100000000ll, + i10e18 * 1000000000ll, + i10e18 * 10000000000ll, + i10e18 * 100000000000ll, + i10e18 * 1000000000000ll, + i10e18 * 10000000000000ll, + i10e18 * 100000000000000ll, + i10e18 * 1000000000000000ll, + i10e18 * 10000000000000000ll, + i10e18 * 100000000000000000ll, + i10e18 * i10e18, + i10e18 * i10e18 * 10ll, + i10e18 * i10e18 * 100ll}; + if (LIKELY(scale < INT128_SCALE_UPPER_BOUND)) return values[scale]; + return -1; // Overflow +} + +template <> +inline int256_t DecimalUtil::GetScaleMultiplier<int256_t>(int scale) { + DCHECK_GE(scale, 0); + static constexpr int256_t i10e18{1000000000000000000ll}; + static constexpr int256_t values[INT256_SCALE_UPPER_BOUND] = { + static_cast<int256_t>(1ll), + static_cast<int256_t>(10ll), + static_cast<int256_t>(100ll), + static_cast<int256_t>(1000ll), + static_cast<int256_t>(10000ll), + static_cast<int256_t>(100000ll), + static_cast<int256_t>(1000000ll), + static_cast<int256_t>(10000000ll), + static_cast<int256_t>(100000000ll), + static_cast<int256_t>(1000000000ll), + static_cast<int256_t>(10000000000ll), + static_cast<int256_t>(100000000000ll), + static_cast<int256_t>(1000000000000ll), + static_cast<int256_t>(10000000000000ll), + static_cast<int256_t>(100000000000000ll), + static_cast<int256_t>(1000000000000000ll), + static_cast<int256_t>(10000000000000000ll), + static_cast<int256_t>(100000000000000000ll), + i10e18, + i10e18 * 10ll, + i10e18 * 100ll, + i10e18 * 1000ll, + i10e18 * 10000ll, + i10e18 * 100000ll, + i10e18 * 1000000ll, + i10e18 * 10000000ll, + i10e18 * 100000000ll, + i10e18 * 1000000000ll, + i10e18 * 10000000000ll, + i10e18 * 100000000000ll, + i10e18 * 1000000000000ll, + i10e18 * 10000000000000ll, + i10e18 * 100000000000000ll, + i10e18 * 1000000000000000ll, + i10e18 * 10000000000000000ll, + i10e18 * 100000000000000000ll, + i10e18 * i10e18, + i10e18 * i10e18 * 10ll, + i10e18 * i10e18 * 100ll, + i10e18 * i10e18 * 1000ll, + i10e18 * i10e18 * 10000ll, + i10e18 * i10e18 * 100000ll, + i10e18 * i10e18 * 1000000ll, + i10e18 * i10e18 * 10000000ll, + i10e18 * i10e18 * 100000000ll, + i10e18 * i10e18 * 1000000000ll, + i10e18 * i10e18 * 10000000000ll, + i10e18 * i10e18 * 100000000000ll, + i10e18 * i10e18 * 1000000000000ll, + i10e18 * i10e18 * 10000000000000ll, + i10e18 * i10e18 * 100000000000000ll, + i10e18 * i10e18 * 1000000000000000ll, + i10e18 * i10e18 * 10000000000000000ll, + i10e18 * i10e18 * 100000000000000000ll, + i10e18 * i10e18 * i10e18, + i10e18 * i10e18 * i10e18 * 10ll, + i10e18 * i10e18 * i10e18 * 100ll, + i10e18 * i10e18 * i10e18 * 1000ll, + i10e18 * i10e18 * i10e18 * 10000ll, + i10e18 * i10e18 * i10e18 * 100000ll, + i10e18 * i10e18 * i10e18 * 1000000ll, + i10e18 * i10e18 * i10e18 * 10000000ll, + i10e18 * i10e18 * i10e18 * 100000000ll, + i10e18 * i10e18 * i10e18 * 1000000000ll, + i10e18 * i10e18 * i10e18 * 10000000000ll, + i10e18 * i10e18 * i10e18 * 100000000000ll, + i10e18 * i10e18 * i10e18 * 1000000000000ll, + i10e18 * i10e18 * i10e18 * 10000000000000ll, + i10e18 * i10e18 * i10e18 * 100000000000000ll, + i10e18 * i10e18 * i10e18 * 1000000000000000ll, + i10e18 * i10e18 * i10e18 * 10000000000000000ll, + i10e18 * i10e18 * i10e18 * 100000000000000000ll, + i10e18 * i10e18 * i10e18 * i10e18, + i10e18 * i10e18 * i10e18 * i10e18 * 10ll, + i10e18 * i10e18 * i10e18 * i10e18 * 100ll, + i10e18 * i10e18 * i10e18 * i10e18 * 1000ll, + i10e18 * i10e18 * i10e18 * i10e18 * 10000ll}; + if (LIKELY(scale < INT256_SCALE_UPPER_BOUND)) return values[scale]; return -1; // Overflow } }
