This is an automated email from the ASF dual-hosted git repository.

zhangstar333 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
     new 2879d8f7d10 [Performance](opt) opt the memcpy and string compare performance (#35713)
2879d8f7d10 is described below

commit 2879d8f7d108b2352f3c592c49eb2d4f67738916
Author: HappenLee <happen...@hotmail.com>
AuthorDate: Sun Jun 2 00:02:35 2024 +0800

    [Performance](opt) opt the memcpy and string compare performance (#35713)

    ## Proposed changes

    Optimize memcpy and string-compare performance: drop the gutil fastmem
    helpers in favor of plain memcmp, and add an AVX2-aware memcpy_inlined
    for variable small copies.

    Before:
    ```
    select count(cust_id) from
    ( select * from rpt where dt = '2023-06-19' and dkye>0) a
    join
    ( select * from rpt1 where dt = '2023-06-19') b
    on b.cust_id = a.hxkhh;
    +----------------+
    | count(cust_id) |
    +----------------+
    |      515127559 |
    +----------------+
    1 row in set (5.85 sec)
    ```

    After:
    ```
    select count(cust_id) from
    ( select * from rpt where dt = '2023-06-19' and dkye>0) a
    join
    ( select * from rpt1 where dt = '2023-06-19') b
    on b.cust_id = a.hxkhh;
    +----------------+
    | count(cust_id) |
    +----------------+
    |      515127559 |
    +----------------+
    1 row in set (4.25 sec)
    ```
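Both timings come from the same join; only the copy/compare internals changed. To isolate the copy path outside a full query, a microbenchmark along the following lines can be used. This is a hypothetical harness, not part of the commit: `bench_ms`, the buffer sizes, and the iteration count are all made up. Swap the memcpy lambda for a wrapper around `doris::memcpy_inlined` (added below) to compare the two.

```
#include <chrono>
#include <cstddef>
#include <cstdio>
#include <cstring>
#include <random>
#include <vector>

// Hypothetical harness: time one copy routine over millions of
// variable-size small copies, the pattern ColumnString produces.
template <typename Copy>
double bench_ms(Copy copy, const std::vector<size_t>& sizes) {
    std::vector<char> src(1 << 20, 'x'), dst(1 << 20, 0);
    auto t0 = std::chrono::steady_clock::now();
    size_t pos = 0;
    for (size_t n : sizes) {
        if (pos + n > dst.size()) pos = 0; // wrap inside the buffer
        copy(dst.data() + pos, src.data() + pos, n);
        pos += n;
    }
    auto t1 = std::chrono::steady_clock::now();
    std::printf("sink: %d\n", dst[0]); // read dst so the loop is not elided
    return std::chrono::duration<double, std::milli>(t1 - t0).count();
}

int main() {
    std::mt19937 rng(42);
    std::uniform_int_distribution<size_t> len(1, 64); // short-string regime
    std::vector<size_t> sizes(10'000'000);
    for (auto& n : sizes) n = len(rng);
    std::printf("memcpy: %.1f ms\n",
                bench_ms([](void* d, const void* s, size_t n) { std::memcpy(d, s, n); }, sizes));
    return 0;
}
```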
---
 be/src/gutil/strings/fastmem.h         | 159 ---------------------------------
 be/src/gutil/strings/stringpiece.h     |  10 +--
 be/src/util/bitmap.h                   |   4 +-
 be/src/util/faststring.h               |   4 +-
 be/src/util/memcpy_inlined.h           | 147 ++++++++++++++++++++++++++++++
 be/src/vec/columns/column_string.cpp   |   6 +-
 be/src/vec/common/string_ref.h         |   8 +-
 be/test/util/mysql_row_buffer_test.cpp |   3 -
 8 files changed, 160 insertions(+), 181 deletions(-)

diff --git a/be/src/gutil/strings/fastmem.h b/be/src/gutil/strings/fastmem.h
deleted file mode 100644
index dafc6c4a933..00000000000
--- a/be/src/gutil/strings/fastmem.h
+++ /dev/null
@@ -1,159 +0,0 @@
-// Copyright 2008 Google Inc. All Rights Reserved.
-//
-// Fast memory copying and comparison routines.
-// strings::fastmemcmp_inlined() replaces memcmp()
-// strings::memcpy_inlined() replaces memcpy()
-// strings::memeq(a, b, n) replaces memcmp(a, b, n) == 0
-//
-// strings::*_inlined() routines are inline versions of the
-// routines exported by this module. Sometimes using the inlined
-// versions is faster. Measure before using the inlined versions.
-//
-// Performance measurement:
-// strings::fastmemcmp_inlined
-// Analysis: memcmp, fastmemcmp_inlined, fastmemcmp
-// 2012-01-30
-
-#pragma once
-
-#include <stddef.h>
-#include <stdint.h>
-#include <stdio.h>
-#include <string.h>
-
-#include "gutil/integral_types.h"
-#include "gutil/port.h"
-
-namespace strings {
-
-// Return true if the n bytes at a equal the n bytes at b.
-// The regions are allowed to overlap.
-//
-// The performance is similar to the performance memcmp(), but faster for
-// moderately-sized inputs, or inputs that share a common prefix and differ
-// somewhere in their last 8 bytes. Further optimizations can be added later
-// if it makes sense to do so.
-inline bool memeq(const void* a_v, const void* b_v, size_t n) {
-    const uint8_t* a = reinterpret_cast<const uint8_t*>(a_v);
-    const uint8_t* b = reinterpret_cast<const uint8_t*>(b_v);
-
-    size_t n_rounded_down = n & ~static_cast<size_t>(7);
-    if (PREDICT_FALSE(n_rounded_down == 0)) { // n <= 7
-        return memcmp(a, b, n) == 0;
-    }
-    // n >= 8
-    uint64 u = UNALIGNED_LOAD64(a) ^ UNALIGNED_LOAD64(b);
-    uint64 v = UNALIGNED_LOAD64(a + n - 8) ^ UNALIGNED_LOAD64(b + n - 8);
-    if ((u | v) != 0) { // The first or last 8 bytes differ.
-        return false;
-    }
-    a += 8;
-    b += 8;
-    n = n_rounded_down - 8;
-    if (n > 128) {
-        // As of 2012, memcmp on x86-64 uses a big unrolled loop with SSE2
-        // instructions, and while we could try to do something faster, it
-        // doesn't seem worth pursuing.
-        return memcmp(a, b, n) == 0;
-    }
-    for (; n >= 16; n -= 16) {
-        uint64 x = UNALIGNED_LOAD64(a) ^ UNALIGNED_LOAD64(b);
-        uint64 y = UNALIGNED_LOAD64(a + 8) ^ UNALIGNED_LOAD64(b + 8);
-        if ((x | y) != 0) {
-            return false;
-        }
-        a += 16;
-        b += 16;
-    }
-    // n must be 0 or 8 now because it was a multiple of 8 at the top of the loop.
-    return n == 0 || UNALIGNED_LOAD64(a) == UNALIGNED_LOAD64(b);
-}
-
-inline int fastmemcmp_inlined(const void* a_void, const void* b_void, size_t n) {
-    const uint8_t* a = reinterpret_cast<const uint8_t*>(a_void);
-    const uint8_t* b = reinterpret_cast<const uint8_t*>(b_void);
-
-    if (n >= 64) {
-        return memcmp(a, b, n);
-    }
-    const void* a_limit = a + n;
-    const size_t sizeof_uint64 = sizeof(uint64); // NOLINT(runtime/sizeof)
-    while (a + sizeof_uint64 <= a_limit && UNALIGNED_LOAD64(a) == UNALIGNED_LOAD64(b)) {
-        a += sizeof_uint64;
-        b += sizeof_uint64;
-    }
-    const size_t sizeof_uint32 = sizeof(uint32); // NOLINT(runtime/sizeof)
-    if (a + sizeof_uint32 <= a_limit && UNALIGNED_LOAD32(a) == UNALIGNED_LOAD32(b)) {
-        a += sizeof_uint32;
-        b += sizeof_uint32;
-    }
-    while (a < a_limit) {
-        int d = static_cast<uint32>(*a++) - static_cast<uint32>(*b++);
-        if (d) return d;
-    }
-    return 0;
-}
-
-// The standard memcpy operation is slow for variable small sizes.
-// This implementation inlines the optimal realization for sizes 1 to 16.
-// To avoid code bloat don't use it in case of not performance-critical spots,
-// nor when you don't expect very frequent values of size <= 16.
-inline void memcpy_inlined(void* dst, const void* src, size_t size) {
-    // Compiler inlines code with minimal amount of data movement when third
-    // parameter of memcpy is a constant.
-    switch (size) {
-    case 1:
-        memcpy(dst, src, 1);
-        break;
-    case 2:
-        memcpy(dst, src, 2);
-        break;
-    case 3:
-        memcpy(dst, src, 3);
-        break;
-    case 4:
-        memcpy(dst, src, 4);
-        break;
-    case 5:
-        memcpy(dst, src, 5);
-        break;
-    case 6:
-        memcpy(dst, src, 6);
-        break;
-    case 7:
-        memcpy(dst, src, 7);
-        break;
-    case 8:
-        memcpy(dst, src, 8);
-        break;
-    case 9:
-        memcpy(dst, src, 9);
-        break;
-    case 10:
-        memcpy(dst, src, 10);
-        break;
-    case 11:
-        memcpy(dst, src, 11);
-        break;
-    case 12:
-        memcpy(dst, src, 12);
-        break;
-    case 13:
-        memcpy(dst, src, 13);
-        break;
-    case 14:
-        memcpy(dst, src, 14);
-        break;
-    case 15:
-        memcpy(dst, src, 15);
-        break;
-    case 16:
-        memcpy(dst, src, 16);
-        break;
-    default:
-        memcpy(dst, src, size);
-        break;
-    }
-}
-
-} // namespace strings
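The deleted helpers above and the new util/memcpy_inlined.h added later in this patch share one core trick: a chunk of uneven length is copied (or compared) as a fixed-size head plus a fixed-size tail that may overlap in the middle, so no per-byte loop or size switch is needed. A standalone sketch of the idea, not taken from the patch:

```
#include <cassert>
#include <cstddef>
#include <cstring>

// Standalone illustration (not the committed code): copy n bytes,
// 8 <= n <= 16, with two fixed-size 8-byte copies that overlap in the
// middle -- tail [n-8, n) first, then head [0, 8), the same order the
// new memcpy_inlined uses. Every byte is covered by one of the two.
inline void copy_8_to_16(void* dst, const void* src, std::size_t n) {
    assert(n >= 8 && n <= 16);
    auto* d = static_cast<char*>(dst);
    auto* s = static_cast<const char*>(src);
    std::memcpy(d + n - 8, s + n - 8, 8); // tail: one unaligned 8-byte mov
    std::memcpy(d, s, 8);                 // head: overlaps the tail when n < 16
}

int main() {
    const char src[16] = "0123456789abcde";
    char dst[16] = {};
    copy_8_to_16(dst, src, 13); // head covers [0, 8), tail covers [5, 13)
    assert(std::memcmp(dst, src, 13) == 0);
    return 0;
}
```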
diff --git a/be/src/gutil/strings/stringpiece.h b/be/src/gutil/strings/stringpiece.h
index 60aeb5a2749..0ee2bcc47eb 100644
--- a/be/src/gutil/strings/stringpiece.h
+++ b/be/src/gutil/strings/stringpiece.h
@@ -114,14 +114,14 @@
 #include <assert.h>
 #include <stddef.h>
 #include <string.h>
-#include <iosfwd>
-#include <string>
+
 #include <cstddef>
+#include <iosfwd>
 #include <iterator>
-#include <string_view>
 #include <limits> // IWYU pragma: keep
+#include <string>
+#include <string_view>
 
-#include "gutil/strings/fastmem.h"
 #include "gutil/hash/string_hash.h"
 #include "gutil/int128.h"
 
@@ -296,7 +296,7 @@ inline bool operator==(StringPiece x, StringPiece y) {
         return false;
     }
 
-    return x.data() == y.data() || len <= 0 || strings::memeq(x.data(), y.data(), len);
+    return x.data() == y.data() || len <= 0 || 0 == memcmp(x.data(), y.data(), len);
 }
 
 inline bool operator!=(StringPiece x, StringPiece y) {
diff --git a/be/src/util/bitmap.h b/be/src/util/bitmap.h
index c40b082d0d1..32eeb64bd88 100644
--- a/be/src/util/bitmap.h
+++ b/be/src/util/bitmap.h
@@ -29,7 +29,6 @@
 #include <vector>
 
 #include "gutil/port.h"
-#include "gutil/strings/fastmem.h"
 #include "util/bit_util.h"
 
 namespace doris {
@@ -105,9 +104,8 @@ inline bool BitmapIsAllZero(const uint8_t* bitmap, size_t offset, size_t bitmap_
 //
 // It is assumed that both bitmaps have 'bitmap_size' number of bits.
 inline bool BitmapEquals(const uint8_t* bm1, const uint8_t* bm2, size_t bitmap_size) {
-    // Use memeq() to check all of the full bytes.
     size_t num_full_bytes = bitmap_size >> 3;
-    if (!strings::memeq(bm1, bm2, num_full_bytes)) {
+    if (memcmp(bm1, bm2, num_full_bytes)) {
         return false;
     }
 
diff --git a/be/src/util/faststring.h b/be/src/util/faststring.h
index 4edcce9836d..8d9fa6d004f 100644
--- a/be/src/util/faststring.h
+++ b/be/src/util/faststring.h
@@ -25,7 +25,7 @@
 #include "gutil/dynamic_annotations.h"
 #include "gutil/port.h"
-#include "gutil/strings/fastmem.h"
+#include "util/memcpy_inlined.h"
 #include "util/slice.h"
 #include "vec/common/allocator.h"
 
@@ -123,7 +123,7 @@ public:
                 *p++ = *src++;
             }
         } else {
-            strings::memcpy_inlined(&data_[len_], src, count);
+            memcpy_inlined(&data_[len_], src, count);
         }
         len_ += count;
     }
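The three call sites above replace `strings::memeq(a, b, n)` with `memcmp(a, b, n) == 0`, which is exactly what memeq guaranteed to compute. If in doubt, the equivalence is cheap to spot-check with a randomized test; everything in this sketch (the `byte_eq` reference, the constants) is illustrative, not part of the commit:

```
#include <cstdio>
#include <cstring>
#include <random>
#include <vector>

// Reference byte-by-byte equality: the semantics strings::memeq() promised.
static bool byte_eq(const unsigned char* a, const unsigned char* b, size_t n) {
    for (size_t i = 0; i < n; ++i) {
        if (a[i] != b[i]) return false;
    }
    return true;
}

int main() {
    std::mt19937 rng(7);
    for (int iter = 0; iter < 100000; ++iter) {
        size_t n = rng() % 64;
        std::vector<unsigned char> a(n);
        for (auto& c : a) c = rng() % 4;           // small alphabet: equality is common
        std::vector<unsigned char> b = a;
        if (n > 0 && rng() % 2) b[rng() % n] ^= 1; // sometimes flip one byte
        bool expect = byte_eq(a.data(), b.data(), n);
        bool got = (n == 0) || std::memcmp(a.data(), b.data(), n) == 0;
        if (expect != got) {
            std::printf("mismatch at n=%zu\n", n);
            return 1;
        }
    }
    std::puts("ok");
    return 0;
}
```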
diff --git a/be/src/util/memcpy_inlined.h b/be/src/util/memcpy_inlined.h
new file mode 100644
index 00000000000..2b2063e5ca1
--- /dev/null
+++ b/be/src/util/memcpy_inlined.h
@@ -0,0 +1,147 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#ifdef __AVX2__
+#include <emmintrin.h>
+#include <immintrin.h>
+#endif
+
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "common/compiler_util.h"
+#include "gutil/integral_types.h"
+#include "gutil/port.h"
+
+namespace doris {
+
+ALWAYS_INLINE inline void memcpy_inlined(void* __restrict _dst, const void* __restrict _src,
+                                         size_t size) {
+    auto dst = static_cast<uint8_t*>(_dst);
+    auto src = static_cast<const uint8_t*>(_src);
+
+    [[maybe_unused]] tail:
+    /// Small sizes and tails after the loop for large sizes.
+    /// The order of branches is important but in fact the optimal order depends on the distribution of sizes in your application.
+    /// This order of branches is from the disassembly of glibc's code.
+    /// We copy chunks of possibly uneven size with two overlapping movs.
+    /// Example: to copy 5 bytes [0, 1, 2, 3, 4] we will copy tail [1, 2, 3, 4] first and then head [0, 1, 2, 3].
+    if (size <= 16) {
+        if (size >= 8) {
+            /// Chunks of 8..16 bytes.
+            __builtin_memcpy(dst + size - 8, src + size - 8, 8);
+            __builtin_memcpy(dst, src, 8);
+        } else if (size >= 4) {
+            /// Chunks of 4..7 bytes.
+            __builtin_memcpy(dst + size - 4, src + size - 4, 4);
+            __builtin_memcpy(dst, src, 4);
+        } else if (size >= 2) {
+            /// Chunks of 2..3 bytes.
+            __builtin_memcpy(dst + size - 2, src + size - 2, 2);
+            __builtin_memcpy(dst, src, 2);
+        } else if (size >= 1) {
+            /// A single byte.
+            *dst = *src;
+        }
+        /// No bytes remaining.
+    }
+    else {
+#ifdef __AVX2__
+        if (size <= 256) {
+            if (size <= 32) {
+                __builtin_memcpy(dst, src, 8);
+                __builtin_memcpy(dst + 8, src + 8, 8);
+                size -= 16;
+                dst += 16;
+                src += 16;
+                goto tail;
+            }
+
+            /// Then we will copy every 16 bytes from the beginning in a loop.
+            /// The last loop iteration will possibly overwrite some part of already copied last 32 bytes.
+            /// This is Ok, similar to the code for small sizes above.
+            while (size > 32) {
+                _mm256_storeu_si256(reinterpret_cast<__m256i*>(dst),
+                                    _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src)));
+                dst += 32;
+                src += 32;
+                size -= 32;
+            }
+
+            _mm256_storeu_si256(
+                    reinterpret_cast<__m256i*>(dst + size - 32),
+                    _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src + size - 32)));
+        } else {
+            if (size >= 512 * 1024 && size <= 2048 * 1024) {
+                asm volatile("rep movsb"
+                             : "=D"(dst), "=S"(src), "=c"(size)
+                             : "0"(dst), "1"(src), "2"(size)
+                             : "memory");
+            } else {
+                size_t padding = (32 - (reinterpret_cast<size_t>(dst) & 31)) & 31;
+
+                if (padding > 0) {
+                    __m256i head = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src));
+                    _mm256_storeu_si256(reinterpret_cast<__m256i*>(dst), head);
+                    dst += padding;
+                    src += padding;
+                    size -= padding;
+                }
+
+                /// Aligned unrolled copy. We will use half of available AVX registers.
+                /// It's not possible to have both src and dst aligned.
+                /// So, we will use aligned stores and unaligned loads.
+                __m256i c0, c1, c2, c3, c4, c5, c6, c7;
+
+                while (size >= 256) {
+                    c0 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src));
+                    c1 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src + 32));
+                    c2 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src + 64));
+                    c3 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src + 96));
+                    c4 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src + 128));
+                    c5 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src + 160));
+                    c6 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src + 192));
+                    c7 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src + 224));
+                    src += 256;
+
+                    _mm256_store_si256((reinterpret_cast<__m256i*>(dst)), c0);
+                    _mm256_store_si256((reinterpret_cast<__m256i*>(dst + 32)), c1);
+                    _mm256_store_si256((reinterpret_cast<__m256i*>(dst + 64)), c2);
+                    _mm256_store_si256((reinterpret_cast<__m256i*>(dst + 96)), c3);
+                    _mm256_store_si256((reinterpret_cast<__m256i*>(dst + 128)), c4);
+                    _mm256_store_si256((reinterpret_cast<__m256i*>(dst + 160)), c5);
+                    _mm256_store_si256((reinterpret_cast<__m256i*>(dst + 192)), c6);
+                    _mm256_store_si256((reinterpret_cast<__m256i*>(dst + 224)), c7);
+                    dst += 256;
+
+                    size -= 256;
+                }
+
+                goto tail;
+            }
+        }
+#else
+        memcpy(dst, src, size);
+#endif
+    }
+}
+} // namespace doris
\ No newline at end of file
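Branch-heavy routines like the one above are easiest to trust with an exhaustive sweep over the sizes that straddle its branch boundaries (the sub-16 ladder, 32, 256, and the rep-movsb window). A minimal sketch, assuming only that util/memcpy_inlined.h is on the include path; the size list and fill pattern are made up:

```
#include <cstdio>
#include <cstring>
#include <vector>

#include "util/memcpy_inlined.h" // the header added above

int main() {
    // Exercise every size up to 300 (covers the 1/2/4/8/16 ladder, the
    // 32-byte and 256-byte AVX2 paths, plus tail remainders), then one
    // size inside the rep-movsb window.
    std::vector<size_t> sizes;
    for (size_t n = 0; n <= 300; ++n) sizes.push_back(n);
    sizes.push_back(512 * 1024);
    for (size_t n : sizes) {
        std::vector<unsigned char> src(n), dst(n, 0xee);
        for (size_t i = 0; i < n; ++i) src[i] = static_cast<unsigned char>(i * 131 + 7);
        doris::memcpy_inlined(dst.data(), src.data(), n);
        if (n != 0 && std::memcmp(dst.data(), src.data(), n) != 0) {
            std::printf("FAIL at size %zu\n", n);
            return 1;
        }
    }
    std::puts("all sizes ok");
    return 0;
}
```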
diff --git a/be/src/vec/columns/column_string.cpp b/be/src/vec/columns/column_string.cpp
index c737cd5dd5d..446fd283b1c 100644
--- a/be/src/vec/columns/column_string.cpp
+++ b/be/src/vec/columns/column_string.cpp
@@ -24,6 +24,7 @@
 #include <boost/iterator/iterator_facade.hpp>
 #include <ostream>
 
+#include "util/memcpy_inlined.h"
 #include "util/simd/bits.h"
 #include "vec/columns/columns_common.h"
 #include "vec/common/arena.h"
@@ -195,8 +196,9 @@ void ColumnStr<T>::insert_indices_from(const IColumn& src, const uint32_t* indic
             const size_t size_to_append =
                     src_offset_data[src_offset] - src_offset_data[src_offset - 1];
             const size_t offset = src_offset_data[src_offset - 1];
-            memcpy_small_allow_read_write_overflow15(dst_data_ptr + dst_chars_pos,
-                                                     src_data_ptr + offset, size_to_append);
+
+            memcpy_inlined(dst_data_ptr + dst_chars_pos, src_data_ptr + offset, size_to_append);
+
             dst_chars_pos += size_to_append;
         }
     };
diff --git a/be/src/vec/common/string_ref.h b/be/src/vec/common/string_ref.h
index 72a7a911210..9851bf8f3bf 100644
--- a/be/src/vec/common/string_ref.h
+++ b/be/src/vec/common/string_ref.h
@@ -270,13 +270,7 @@ struct StringRef {
 
     // ==
     bool eq(const StringRef& other) const {
-        if (this->size != other.size) {
-            return false;
-        }
-#if defined(__SSE2__) || defined(__aarch64__)
-        return memequalSSE2Wide(this->data, other.data, this->size);
-#endif
-        return string_compare(this->data, this->size, other.data, other.size, this->size) == 0;
+        return (size == other.size) && (memcmp(data, other.data, size) == 0);
     }
 
     bool operator==(const StringRef& other) const { return eq(other); }
diff --git a/be/test/util/mysql_row_buffer_test.cpp b/be/test/util/mysql_row_buffer_test.cpp
index cfe012fdcd5..7057b189022 100644
--- a/be/test/util/mysql_row_buffer_test.cpp
+++ b/be/test/util/mysql_row_buffer_test.cpp
@@ -24,12 +24,9 @@
 #include <string>
 
 #include "gtest/gtest_pred_impl.h"
-#include "gutil/strings/fastmem.h"
 
 namespace doris {
 
-using namespace strings;
-
 TEST(MysqlRowBufferTest, basic) {
     MysqlRowBuffer mrb;
 
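The new eq body relies on the length check short-circuiting memcmp, and on modern compilers and libc expanding memcmp well for short inputs. A self-contained stand-in showing the same shape; the `Ref` type here is hypothetical, not the actual doris::StringRef:

```
#include <cassert>
#include <cstddef>
#include <cstring>

// Hypothetical stand-in for a {pointer, length} string view; the real
// doris::StringRef carries the same two fields.
struct Ref {
    const char* data;
    std::size_t size;
    bool eq(const Ref& other) const {
        // Cheap integer compare first: memcmp only runs on
        // equal-length inputs, which filters most mismatches.
        return size == other.size && std::memcmp(data, other.data, size) == 0;
    }
};

int main() {
    Ref a{"doris", 5}, b{"doris", 5}, c{"dor", 3};
    assert(a.eq(b));
    assert(!a.eq(c));
    return 0;
}
```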
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org