This is an automated email from the ASF dual-hosted git repository.

zhangstar333 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
     new 2879d8f7d10 [Performance](opt) opt the memcpy and string compare performance (#35713)
2879d8f7d10 is described below

commit 2879d8f7d108b2352f3c592c49eb2d4f67738916
Author: HappenLee <happen...@hotmail.com>
AuthorDate: Sun Jun 2 00:02:35 2024 +0800

    [Performance](opt) opt the memcpy and string compare performance (#35713)

    ## Proposed changes

    Optimize memcpy and string-compare performance: drop the gutil fastmem
    helpers in favor of plain memcmp, and add an AVX2-aware memcpy_inlined
    for variable small copies.

    Before:
    ```
    select count(cust_id) from
    ( select * from rpt where dt = '2023-06-19' and dkye>0) a
    join
    ( select * from rpt1 where dt = '2023-06-19') b
    on b.cust_id = a.hxkhh;
    +----------------+
    | count(cust_id) |
    +----------------+
    |      515127559 |
    +----------------+
    1 row in set (5.85 sec)
    ```

    After:
    ```
    select count(cust_id) from
    ( select * from rpt where dt = '2023-06-19' and dkye>0) a
    join
    ( select * from rpt1 where dt = '2023-06-19') b
    on b.cust_id = a.hxkhh;
    +----------------+
    | count(cust_id) |
    +----------------+
    |      515127559 |
    +----------------+
    1 row in set (4.25 sec)
    ```
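Both timings come from the same join; only the copy/compare internals changed. To isolate the copy path outside a full query, a microbenchmark along the following lines can be used. This is a hypothetical harness, not part of the commit: `bench_ms`, the buffer sizes, and the iteration count are all made up. Swap the memcpy lambda for a wrapper around `doris::memcpy_inlined` (added below) to compare the two.

```
#include <chrono>
#include <cstddef>
#include <cstdio>
#include <cstring>
#include <random>
#include <vector>

// Hypothetical harness: time one copy routine over millions of
// variable-size small copies, the pattern ColumnString produces.
template <typename Copy>
double bench_ms(Copy copy, const std::vector<size_t>& sizes) {
    std::vector<char> src(1 << 20, 'x'), dst(1 << 20, 0);
    auto t0 = std::chrono::steady_clock::now();
    size_t pos = 0;
    for (size_t n : sizes) {
        if (pos + n > dst.size()) pos = 0; // wrap inside the buffer
        copy(dst.data() + pos, src.data() + pos, n);
        pos += n;
    }
    auto t1 = std::chrono::steady_clock::now();
    std::printf("sink: %d\n", dst[0]); // read dst so the loop is not elided
    return std::chrono::duration<double, std::milli>(t1 - t0).count();
}

int main() {
    std::mt19937 rng(42);
    std::uniform_int_distribution<size_t> len(1, 64); // short-string regime
    std::vector<size_t> sizes(10'000'000);
    for (auto& n : sizes) n = len(rng);
    std::printf("memcpy: %.1f ms\n",
                bench_ms([](void* d, const void* s, size_t n) { std::memcpy(d, s, n); }, sizes));
    return 0;
}
```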
---
 be/src/gutil/strings/fastmem.h         | 159 ---------------------------------
 be/src/gutil/strings/stringpiece.h     |  10 +--
 be/src/util/bitmap.h                   |   4 +-
 be/src/util/faststring.h               |   4 +-
 be/src/util/memcpy_inlined.h           | 147 ++++++++++++++++++++++++++++++
 be/src/vec/columns/column_string.cpp   |   6 +-
 be/src/vec/common/string_ref.h         |   8 +-
 be/test/util/mysql_row_buffer_test.cpp |   3 -
 8 files changed, 160 insertions(+), 181 deletions(-)

diff --git a/be/src/gutil/strings/fastmem.h b/be/src/gutil/strings/fastmem.h
deleted file mode 100644
index dafc6c4a933..00000000000
--- a/be/src/gutil/strings/fastmem.h
+++ /dev/null
@@ -1,159 +0,0 @@
-// Copyright 2008 Google Inc. All Rights Reserved.
-//
-// Fast memory copying and comparison routines.
-// strings::fastmemcmp_inlined() replaces memcmp()
-// strings::memcpy_inlined() replaces memcpy()
-// strings::memeq(a, b, n) replaces memcmp(a, b, n) == 0
-//
-// strings::*_inlined() routines are inline versions of the
-// routines exported by this module. Sometimes using the inlined
-// versions is faster. Measure before using the inlined versions.
-//
-// Performance measurement:
-// strings::fastmemcmp_inlined
-// Analysis: memcmp, fastmemcmp_inlined, fastmemcmp
-// 2012-01-30
-
-#pragma once
-
-#include <stddef.h>
-#include <stdint.h>
-#include <stdio.h>
-#include <string.h>
-
-#include "gutil/integral_types.h"
-#include "gutil/port.h"
-
-namespace strings {
-
-// Return true if the n bytes at a equal the n bytes at b.
-// The regions are allowed to overlap.
-//
-// The performance is similar to the performance memcmp(), but faster for
-// moderately-sized inputs, or inputs that share a common prefix and differ
-// somewhere in their last 8 bytes. Further optimizations can be added later
-// if it makes sense to do so.
-inline bool memeq(const void* a_v, const void* b_v, size_t n) {
-    const uint8_t* a = reinterpret_cast<const uint8_t*>(a_v);
-    const uint8_t* b = reinterpret_cast<const uint8_t*>(b_v);
-
-    size_t n_rounded_down = n & ~static_cast<size_t>(7);
-    if (PREDICT_FALSE(n_rounded_down == 0)) { // n <= 7
-        return memcmp(a, b, n) == 0;
-    }
-    // n >= 8
-    uint64 u = UNALIGNED_LOAD64(a) ^ UNALIGNED_LOAD64(b);
-    uint64 v = UNALIGNED_LOAD64(a + n - 8) ^ UNALIGNED_LOAD64(b + n - 8);
-    if ((u | v) != 0) { // The first or last 8 bytes differ.
-        return false;
-    }
-    a += 8;
-    b += 8;
-    n = n_rounded_down - 8;
-    if (n > 128) {
-        // As of 2012, memcmp on x86-64 uses a big unrolled loop with SSE2
-        // instructions, and while we could try to do something faster, it
-        // doesn't seem worth pursuing.
-        return memcmp(a, b, n) == 0;
-    }
-    for (; n >= 16; n -= 16) {
-        uint64 x = UNALIGNED_LOAD64(a) ^ UNALIGNED_LOAD64(b);
-        uint64 y = UNALIGNED_LOAD64(a + 8) ^ UNALIGNED_LOAD64(b + 8);
-        if ((x | y) != 0) {
-            return false;
-        }
-        a += 16;
-        b += 16;
-    }
-    // n must be 0 or 8 now because it was a multiple of 8 at the top of the loop.
-    return n == 0 || UNALIGNED_LOAD64(a) == UNALIGNED_LOAD64(b);
-}
-
-inline int fastmemcmp_inlined(const void* a_void, const void* b_void, size_t n) {
-    const uint8_t* a = reinterpret_cast<const uint8_t*>(a_void);
-    const uint8_t* b = reinterpret_cast<const uint8_t*>(b_void);
-
-    if (n >= 64) {
-        return memcmp(a, b, n);
-    }
-    const void* a_limit = a + n;
-    const size_t sizeof_uint64 = sizeof(uint64); // NOLINT(runtime/sizeof)
-    while (a + sizeof_uint64 <= a_limit && UNALIGNED_LOAD64(a) == UNALIGNED_LOAD64(b)) {
-        a += sizeof_uint64;
-        b += sizeof_uint64;
-    }
-    const size_t sizeof_uint32 = sizeof(uint32); // NOLINT(runtime/sizeof)
-    if (a + sizeof_uint32 <= a_limit && UNALIGNED_LOAD32(a) == UNALIGNED_LOAD32(b)) {
-        a += sizeof_uint32;
-        b += sizeof_uint32;
-    }
-    while (a < a_limit) {
-        int d = static_cast<uint32>(*a++) - static_cast<uint32>(*b++);
-        if (d) return d;
-    }
-    return 0;
-}
-
-// The standard memcpy operation is slow for variable small sizes.
-// This implementation inlines the optimal realization for sizes 1 to 16.
-// To avoid code bloat don't use it in case of not performance-critical spots,
-// nor when you don't expect very frequent values of size <= 16.
-inline void memcpy_inlined(void* dst, const void* src, size_t size) {
-    // Compiler inlines code with minimal amount of data movement when third
-    // parameter of memcpy is a constant.
-    switch (size) {
-    case 1:
-        memcpy(dst, src, 1);
-        break;
-    case 2:
-        memcpy(dst, src, 2);
-        break;
-    case 3:
-        memcpy(dst, src, 3);
-        break;
-    case 4:
-        memcpy(dst, src, 4);
-        break;
-    case 5:
-        memcpy(dst, src, 5);
-        break;
-    case 6:
-        memcpy(dst, src, 6);
-        break;
-    case 7:
-        memcpy(dst, src, 7);
-        break;
-    case 8:
-        memcpy(dst, src, 8);
-        break;
-    case 9:
-        memcpy(dst, src, 9);
-        break;
-    case 10:
-        memcpy(dst, src, 10);
-        break;
-    case 11:
-        memcpy(dst, src, 11);
-        break;
-    case 12:
-        memcpy(dst, src, 12);
-        break;
-    case 13:
-        memcpy(dst, src, 13);
-        break;
-    case 14:
-        memcpy(dst, src, 14);
-        break;
-    case 15:
-        memcpy(dst, src, 15);
-        break;
-    case 16:
-        memcpy(dst, src, 16);
-        break;
-    default:
-        memcpy(dst, src, size);
-        break;
-    }
-}
-
-} // namespace strings
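The deleted helpers above and the new util/memcpy_inlined.h added later in this patch share one core trick: a chunk of uneven length is copied (or compared) as a fixed-size head plus a fixed-size tail that may overlap in the middle, so no per-byte loop or size switch is needed. A standalone sketch of the idea, not taken from the patch:

```
#include <cassert>
#include <cstddef>
#include <cstring>

// Standalone illustration (not the committed code): copy n bytes,
// 8 <= n <= 16, with two fixed-size 8-byte copies that overlap in the
// middle -- tail [n-8, n) first, then head [0, 8), the same order the
// new memcpy_inlined uses. Every byte is covered by one of the two.
inline void copy_8_to_16(void* dst, const void* src, std::size_t n) {
    assert(n >= 8 && n <= 16);
    auto* d = static_cast<char*>(dst);
    auto* s = static_cast<const char*>(src);
    std::memcpy(d + n - 8, s + n - 8, 8); // tail: one unaligned 8-byte mov
    std::memcpy(d, s, 8);                 // head: overlaps the tail when n < 16
}

int main() {
    const char src[16] = "0123456789abcde";
    char dst[16] = {};
    copy_8_to_16(dst, src, 13); // head covers [0, 8), tail covers [5, 13)
    assert(std::memcmp(dst, src, 13) == 0);
    return 0;
}
```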
diff --git a/be/src/gutil/strings/stringpiece.h b/be/src/gutil/strings/stringpiece.h
index 60aeb5a2749..0ee2bcc47eb 100644
--- a/be/src/gutil/strings/stringpiece.h
+++ b/be/src/gutil/strings/stringpiece.h
@@ -114,14 +114,14 @@
 #include <assert.h>
 #include <stddef.h>
 #include <string.h>
-#include <iosfwd>
-#include <string>
+
 #include <cstddef>
+#include <iosfwd>
 #include <iterator>
-#include <string_view>
 #include <limits> // IWYU pragma: keep
+#include <string>
+#include <string_view>
 
-#include "gutil/strings/fastmem.h"
 #include "gutil/hash/string_hash.h"
 #include "gutil/int128.h"
 
@@ -296,7 +296,7 @@ inline bool operator==(StringPiece x, StringPiece y) {
         return false;
     }
 
-    return x.data() == y.data() || len <= 0 || strings::memeq(x.data(), y.data(), len);
+    return x.data() == y.data() || len <= 0 || 0 == memcmp(x.data(), y.data(), len);
 }
 
 inline bool operator!=(StringPiece x, StringPiece y) {
diff --git a/be/src/util/bitmap.h b/be/src/util/bitmap.h
index c40b082d0d1..32eeb64bd88 100644
--- a/be/src/util/bitmap.h
+++ b/be/src/util/bitmap.h
@@ -29,7 +29,6 @@
 #include <vector>
 
 #include "gutil/port.h"
-#include "gutil/strings/fastmem.h"
 #include "util/bit_util.h"
 
 namespace doris {
@@ -105,9 +104,8 @@ inline bool BitmapIsAllZero(const uint8_t* bitmap, size_t offset, size_t bitmap_
 //
 // It is assumed that both bitmaps have 'bitmap_size' number of bits.
 inline bool BitmapEquals(const uint8_t* bm1, const uint8_t* bm2, size_t bitmap_size) {
-    // Use memeq() to check all of the full bytes.
     size_t num_full_bytes = bitmap_size >> 3;
-    if (!strings::memeq(bm1, bm2, num_full_bytes)) {
+    if (memcmp(bm1, bm2, num_full_bytes)) {
         return false;
     }
 
diff --git a/be/src/util/faststring.h b/be/src/util/faststring.h
index 4edcce9836d..8d9fa6d004f 100644
--- a/be/src/util/faststring.h
+++ b/be/src/util/faststring.h
@@ -25,7 +25,7 @@
 #include "gutil/dynamic_annotations.h"
 #include "gutil/port.h"
-#include "gutil/strings/fastmem.h"
+#include "util/memcpy_inlined.h"
 #include "util/slice.h"
 #include "vec/common/allocator.h"
 
@@ -123,7 +123,7 @@ public:
                 *p++ = *src++;
             }
         } else {
-            strings::memcpy_inlined(&data_[len_], src, count);
+            memcpy_inlined(&data_[len_], src, count);
         }
         len_ += count;
     }
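The three call sites above replace `strings::memeq(a, b, n)` with `memcmp(a, b, n) == 0`, which is exactly what memeq guaranteed to compute. If in doubt, the equivalence is cheap to spot-check with a randomized test; everything in this sketch (the `byte_eq` reference, the constants) is illustrative, not part of the commit:

```
#include <cstdio>
#include <cstring>
#include <random>
#include <vector>

// Reference byte-by-byte equality: the semantics strings::memeq() promised.
static bool byte_eq(const unsigned char* a, const unsigned char* b, size_t n) {
    for (size_t i = 0; i < n; ++i) {
        if (a[i] != b[i]) return false;
    }
    return true;
}

int main() {
    std::mt19937 rng(7);
    for (int iter = 0; iter < 100000; ++iter) {
        size_t n = rng() % 64;
        std::vector<unsigned char> a(n);
        for (auto& c : a) c = rng() % 4;           // small alphabet: equality is common
        std::vector<unsigned char> b = a;
        if (n > 0 && rng() % 2) b[rng() % n] ^= 1; // sometimes flip one byte
        bool expect = byte_eq(a.data(), b.data(), n);
        bool got = (n == 0) || std::memcmp(a.data(), b.data(), n) == 0;
        if (expect != got) {
            std::printf("mismatch at n=%zu\n", n);
            return 1;
        }
    }
    std::puts("ok");
    return 0;
}
```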
diff --git a/be/src/util/memcpy_inlined.h b/be/src/util/memcpy_inlined.h
new file mode 100644
index 00000000000..2b2063e5ca1
--- /dev/null
+++ b/be/src/util/memcpy_inlined.h
@@ -0,0 +1,147 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#ifdef __AVX2__
+#include <emmintrin.h>
+#include <immintrin.h>
+#endif
+
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "common/compiler_util.h"
+#include "gutil/integral_types.h"
+#include "gutil/port.h"
+
+namespace doris {
+
+ALWAYS_INLINE inline void memcpy_inlined(void* __restrict _dst, const void* __restrict _src,
+                                         size_t size) {
+    auto dst = static_cast<uint8_t*>(_dst);
+    auto src = static_cast<const uint8_t*>(_src);
+
+    [[maybe_unused]] tail:
+    /// Small sizes and tails after the loop for large sizes.
+    /// The order of branches is important but in fact the optimal order depends on the distribution of sizes in your application.
+    /// This order of branches is from the disassembly of glibc's code.
+    /// We copy chunks of possibly uneven size with two overlapping movs.
+    /// Example: to copy 5 bytes [0, 1, 2, 3, 4] we will copy tail [1, 2, 3, 4] first and then head [0, 1, 2, 3].
+    if (size <= 16) {
+        if (size >= 8) {
+            /// Chunks of 8..16 bytes.
+            __builtin_memcpy(dst + size - 8, src + size - 8, 8);
+            __builtin_memcpy(dst, src, 8);
+        } else if (size >= 4) {
+            /// Chunks of 4..7 bytes.
+            __builtin_memcpy(dst + size - 4, src + size - 4, 4);
+            __builtin_memcpy(dst, src, 4);
+        } else if (size >= 2) {
+            /// Chunks of 2..3 bytes.
+            __builtin_memcpy(dst + size - 2, src + size - 2, 2);
+            __builtin_memcpy(dst, src, 2);
+        } else if (size >= 1) {
+            /// A single byte.
+            *dst = *src;
+        }
+        /// No bytes remaining.
+    }
+    else {
+#ifdef __AVX2__
+        if (size <= 256) {
+            if (size <= 32) {
+                __builtin_memcpy(dst, src, 8);
+                __builtin_memcpy(dst + 8, src + 8, 8);
+                size -= 16;
+                dst += 16;
+                src += 16;
+                goto tail;
+            }
+
+            /// Then we will copy every 16 bytes from the beginning in a loop.
+            /// The last loop iteration will possibly overwrite some part of already copied last 32 bytes.
+            /// This is Ok, similar to the code for small sizes above.
+            while (size > 32) {
+                _mm256_storeu_si256(reinterpret_cast<__m256i*>(dst),
+                                    _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src)));
+                dst += 32;
+                src += 32;
+                size -= 32;
+            }
+
+            _mm256_storeu_si256(
+                    reinterpret_cast<__m256i*>(dst + size - 32),
+                    _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src + size - 32)));
+        } else {
+            if (size >= 512 * 1024 && size <= 2048 * 1024) {
+                asm volatile("rep movsb"
+                             : "=D"(dst), "=S"(src), "=c"(size)
+                             : "0"(dst), "1"(src), "2"(size)
+                             : "memory");
+            } else {
+                size_t padding = (32 - (reinterpret_cast<size_t>(dst) & 31)) & 31;
+
+                if (padding > 0) {
+                    __m256i head = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src));
+                    _mm256_storeu_si256(reinterpret_cast<__m256i*>(dst), head);
+                    dst += padding;
+                    src += padding;
+                    size -= padding;
+                }
+
+                /// Aligned unrolled copy. We will use half of available AVX registers.
+                /// It's not possible to have both src and dst aligned.
+                /// So, we will use aligned stores and unaligned loads.
+                __m256i c0, c1, c2, c3, c4, c5, c6, c7;
+
+                while (size >= 256) {
+                    c0 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src));
+                    c1 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src + 32));
+                    c2 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src + 64));
+                    c3 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src + 96));
+                    c4 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src + 128));
+                    c5 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src + 160));
+                    c6 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src + 192));
+                    c7 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src + 224));
+                    src += 256;
+
+                    _mm256_store_si256((reinterpret_cast<__m256i*>(dst)), c0);
+                    _mm256_store_si256((reinterpret_cast<__m256i*>(dst + 32)), c1);
+                    _mm256_store_si256((reinterpret_cast<__m256i*>(dst + 64)), c2);
+                    _mm256_store_si256((reinterpret_cast<__m256i*>(dst + 96)), c3);
+                    _mm256_store_si256((reinterpret_cast<__m256i*>(dst + 128)), c4);
+                    _mm256_store_si256((reinterpret_cast<__m256i*>(dst + 160)), c5);
+                    _mm256_store_si256((reinterpret_cast<__m256i*>(dst + 192)), c6);
+                    _mm256_store_si256((reinterpret_cast<__m256i*>(dst + 224)), c7);
+                    dst += 256;
+
+                    size -= 256;
+                }
+
+                goto tail;
+            }
+        }
+#else
+        memcpy(dst, src, size);
+#endif
+    }
+}
+} // namespace doris
\ No newline at end of file
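Branch-heavy routines like the one above are easiest to trust with an exhaustive sweep over the sizes that straddle its branch boundaries (the sub-16 ladder, 32, 256, and the rep-movsb window). A minimal sketch, assuming only that util/memcpy_inlined.h is on the include path; the size list and fill pattern are made up:

```
#include <cstdio>
#include <cstring>
#include <vector>

#include "util/memcpy_inlined.h" // the header added above

int main() {
    // Exercise every size up to 300 (covers the 1/2/4/8/16 ladder, the
    // 32-byte and 256-byte AVX2 paths, plus tail remainders), then one
    // size inside the rep-movsb window.
    std::vector<size_t> sizes;
    for (size_t n = 0; n <= 300; ++n) sizes.push_back(n);
    sizes.push_back(512 * 1024);
    for (size_t n : sizes) {
        std::vector<unsigned char> src(n), dst(n, 0xee);
        for (size_t i = 0; i < n; ++i) src[i] = static_cast<unsigned char>(i * 131 + 7);
        doris::memcpy_inlined(dst.data(), src.data(), n);
        if (n != 0 && std::memcmp(dst.data(), src.data(), n) != 0) {
            std::printf("FAIL at size %zu\n", n);
            return 1;
        }
    }
    std::puts("all sizes ok");
    return 0;
}
```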
diff --git a/be/src/vec/columns/column_string.cpp b/be/src/vec/columns/column_string.cpp
index c737cd5dd5d..446fd283b1c 100644
--- a/be/src/vec/columns/column_string.cpp
+++ b/be/src/vec/columns/column_string.cpp
@@ -24,6 +24,7 @@
 #include <boost/iterator/iterator_facade.hpp>
 #include <ostream>
 
+#include "util/memcpy_inlined.h"
 #include "util/simd/bits.h"
 #include "vec/columns/columns_common.h"
 #include "vec/common/arena.h"
@@ -195,8 +196,9 @@ void ColumnStr<T>::insert_indices_from(const IColumn& src, const uint32_t* indic
             const size_t size_to_append =
                     src_offset_data[src_offset] - src_offset_data[src_offset - 1];
             const size_t offset = src_offset_data[src_offset - 1];
-            memcpy_small_allow_read_write_overflow15(dst_data_ptr + dst_chars_pos,
-                                                     src_data_ptr + offset, size_to_append);
+
+            memcpy_inlined(dst_data_ptr + dst_chars_pos, src_data_ptr + offset, size_to_append);
+
             dst_chars_pos += size_to_append;
         }
     };
diff --git a/be/src/vec/common/string_ref.h b/be/src/vec/common/string_ref.h
index 72a7a911210..9851bf8f3bf 100644
--- a/be/src/vec/common/string_ref.h
+++ b/be/src/vec/common/string_ref.h
@@ -270,13 +270,7 @@ struct StringRef {
 
     // ==
     bool eq(const StringRef& other) const {
-        if (this->size != other.size) {
-            return false;
-        }
-#if defined(__SSE2__) || defined(__aarch64__)
-        return memequalSSE2Wide(this->data, other.data, this->size);
-#endif
-        return string_compare(this->data, this->size, other.data, other.size, this->size) == 0;
+        return (size == other.size) && (memcmp(data, other.data, size) == 0);
     }
 
     bool operator==(const StringRef& other) const { return eq(other); }
diff --git a/be/test/util/mysql_row_buffer_test.cpp b/be/test/util/mysql_row_buffer_test.cpp
index cfe012fdcd5..7057b189022 100644
--- a/be/test/util/mysql_row_buffer_test.cpp
+++ b/be/test/util/mysql_row_buffer_test.cpp
@@ -24,12 +24,9 @@
 #include <string>
 
 #include "gtest/gtest_pred_impl.h"
-#include "gutil/strings/fastmem.h"
 
 namespace doris {
 
-using namespace strings;
-
 TEST(MysqlRowBufferTest, basic) {
     MysqlRowBuffer mrb;
 
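The new eq body relies on the length check short-circuiting memcmp, and on modern compilers and libc expanding memcmp well for short inputs. A self-contained stand-in showing the same shape; the `Ref` type here is hypothetical, not the actual doris::StringRef:

```
#include <cassert>
#include <cstddef>
#include <cstring>

// Hypothetical stand-in for a {pointer, length} string view; the real
// doris::StringRef carries the same two fields.
struct Ref {
    const char* data;
    std::size_t size;
    bool eq(const Ref& other) const {
        // Cheap integer compare first: memcmp only runs on
        // equal-length inputs, which filters most mismatches.
        return size == other.size && std::memcmp(data, other.data, size) == 0;
    }
};

int main() {
    Ref a{"doris", 5}, b{"doris", 5}, c{"dor", 3};
    assert(a.eq(b));
    assert(!a.eq(c));
    return 0;
}
```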
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org