This is an automated email from the ASF dual-hosted git repository.

gabriellee pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new d913ca5731 [Opt](vectorized) Speed up bucket shuffle join hash compute (#12407)
d913ca5731 is described below

commit d913ca573116139b4cb95f6f649ce2a4b3870788
Author: HappenLee <happen...@hotmail.com>
AuthorDate: Tue Sep 13 20:19:22 2022 +0800

    [Opt](vectorized) Speed up bucket shuffle join hash compute (#12407)
    
    * [Opt](vectorized) Speed up bucket shuffle join hash compute
---
 be/src/exec/tablet_info.cpp                        | 12 ++---
 be/src/runtime/data_stream_sender.cpp              |  5 +-
 be/src/runtime/define_primitive_type.h             | 58 ++++++++++++++++++++++
 be/src/runtime/primitive_type.h                    | 38 +-------------
 be/src/runtime/raw_value.h                         | 27 ++--------
 be/src/util/hash_util.hpp                          |  7 +++
 be/src/vec/columns/column.h                        | 21 ++++++++
 be/src/vec/columns/column_const.cpp                | 18 +++++++
 be/src/vec/columns/column_const.h                  |  3 ++
 be/src/vec/columns/column_decimal.cpp              | 33 ++++++++++++
 be/src/vec/columns/column_decimal.h                |  2 +
 be/src/vec/columns/column_nullable.cpp             | 19 +++++++
 be/src/vec/columns/column_nullable.h               |  2 +
 be/src/vec/columns/column_string.cpp               | 20 ++++++++
 be/src/vec/columns/column_string.h                 |  3 ++
 be/src/vec/columns/column_vector.cpp               | 33 +++++++++++-
 be/src/vec/columns/column_vector.h                 |  3 ++
 be/src/vec/sink/vdata_stream_sender.cpp            | 18 ++-----
 .../org/apache/doris/analysis/DateLiteral.java     | 28 ++++++++---
 .../org/apache/doris/analysis/DecimalLiteral.java  | 16 ++++--
 .../org/apache/doris/analysis/LargeIntLiteral.java |  6 +++
 21 files changed, 274 insertions(+), 98 deletions(-)

diff --git a/be/src/exec/tablet_info.cpp b/be/src/exec/tablet_info.cpp
index c89e03f0be..08e77d9565 100644
--- a/be/src/exec/tablet_info.cpp
+++ b/be/src/exec/tablet_info.cpp
@@ -220,10 +220,7 @@ Status OlapTablePartitionParam::init() {
                 if (slot != nullptr) {
                     hash_val = RawValue::zlib_crc32(slot, slot_desc->type(), 
hash_val);
                 } else {
-                    //nullptr is treat as 0 when hash
-                    static const int INT_VALUE = 0;
-                    static const TypeDescriptor INT_TYPE(TYPE_INT);
-                    hash_val = RawValue::zlib_crc32(&INT_VALUE, INT_TYPE, 
hash_val);
+                    hash_val = HashUtil::zlib_crc_hash_null(hash_val);
                 }
             }
             return hash_val % num_buckets;
@@ -492,16 +489,13 @@ Status VOlapTablePartitionParam::init() {
             uint32_t hash_val = 0;
             for (int i = 0; i < _distributed_slot_locs.size(); ++i) {
                 auto slot_desc = _slots[_distributed_slot_locs[i]];
-                auto column = 
key->first->get_by_position(_distributed_slot_locs[i]).column;
+                auto& column = 
key->first->get_by_position(_distributed_slot_locs[i]).column;
                 auto val = column->get_data_at(key->second);
                 if (val.data != nullptr) {
                     hash_val = RawValue::zlib_crc32(val.data, val.size, 
slot_desc->type().type,
                                                     hash_val);
                 } else {
-                    // NULL is treat as 0 when hash
-                    static const int INT_VALUE = 0;
-                    static const TypeDescriptor INT_TYPE(TYPE_INT);
-                    hash_val = RawValue::zlib_crc32(&INT_VALUE, INT_TYPE, 
hash_val);
+                    hash_val = HashUtil::zlib_crc_hash_null(hash_val);
                 }
             }
             return hash_val % num_buckets;
diff --git a/be/src/runtime/data_stream_sender.cpp 
b/be/src/runtime/data_stream_sender.cpp
index 13b2308d7b..38d5aa1d51 100644
--- a/be/src/runtime/data_stream_sender.cpp
+++ b/be/src/runtime/data_stream_sender.cpp
@@ -626,10 +626,7 @@ Status DataStreamSender::process_distribute(RuntimeState* 
state, TupleRow* row,
         if (partition_val != nullptr) {
             hash_val = RawValue::zlib_crc32(partition_val, 
ctx->root()->type(), hash_val);
         } else {
-            //nullptr is treat as 0 when hash
-            static const int INT_VALUE = 0;
-            static const TypeDescriptor INT_TYPE(TYPE_INT);
-            hash_val = RawValue::zlib_crc32(&INT_VALUE, INT_TYPE, hash_val);
+            hash_val = HashUtil::zlib_crc_hash_null(hash_val);
         }
     }
     hash_val %= part->distributed_bucket();
diff --git a/be/src/runtime/define_primitive_type.h 
b/be/src/runtime/define_primitive_type.h
new file mode 100644
index 0000000000..aa5e140a6e
--- /dev/null
+++ b/be/src/runtime/define_primitive_type.h
@@ -0,0 +1,58 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+namespace doris {
+enum PrimitiveType {
+    INVALID_TYPE = 0,
+    TYPE_NULL,     /* 1 */
+    TYPE_BOOLEAN,  /* 2 */
+    TYPE_TINYINT,  /* 3 */
+    TYPE_SMALLINT, /* 4 */
+    TYPE_INT,      /* 5 */
+    TYPE_BIGINT,   /* 6 */
+    TYPE_LARGEINT, /* 7 */
+    TYPE_FLOAT,    /* 8 */
+    TYPE_DOUBLE,   /* 9 */
+    TYPE_VARCHAR,  /* 10 */
+    TYPE_DATE,     /* 11 */
+    TYPE_DATETIME, /* 12 */
+    TYPE_BINARY,
+    /* 13 */                     // Not implemented
+    TYPE_DECIMAL [[deprecated]], /* 14 */
+    TYPE_CHAR,                   /* 15 */
+
+    TYPE_STRUCT,    /* 16 */
+    TYPE_ARRAY,     /* 17 */
+    TYPE_MAP,       /* 18 */
+    TYPE_HLL,       /* 19 */
+    TYPE_DECIMALV2, /* 20 */
+
+    TYPE_TIME,           /* 21 */
+    TYPE_OBJECT,         /* 22 */
+    TYPE_STRING,         /* 23 */
+    TYPE_QUANTILE_STATE, /* 24 */
+    TYPE_DATEV2,         /* 25 */
+    TYPE_DATETIMEV2,     /* 26 */
+    TYPE_TIMEV2,         /* 27 */
+    TYPE_DECIMAL32,      /* 28 */
+    TYPE_DECIMAL64,      /* 29 */
+    TYPE_DECIMAL128,     /* 30 */
+};
+
+}
diff --git a/be/src/runtime/primitive_type.h b/be/src/runtime/primitive_type.h
index 7fc6a728de..45efe0336c 100644
--- a/be/src/runtime/primitive_type.h
+++ b/be/src/runtime/primitive_type.h
@@ -19,6 +19,7 @@
 
 #include <string>
 
+#include "runtime/define_primitive_type.h"
 #include "vec/columns/column_decimal.h"
 #include "vec/columns/columns_number.h"
 #include "vec/core/types.h"
@@ -33,43 +34,6 @@ class DateTimeValue;
 class DecimalV2Value;
 struct StringValue;
 
-enum PrimitiveType {
-    INVALID_TYPE = 0,
-    TYPE_NULL,     /* 1 */
-    TYPE_BOOLEAN,  /* 2 */
-    TYPE_TINYINT,  /* 3 */
-    TYPE_SMALLINT, /* 4 */
-    TYPE_INT,      /* 5 */
-    TYPE_BIGINT,   /* 6 */
-    TYPE_LARGEINT, /* 7 */
-    TYPE_FLOAT,    /* 8 */
-    TYPE_DOUBLE,   /* 9 */
-    TYPE_VARCHAR,  /* 10 */
-    TYPE_DATE,     /* 11 */
-    TYPE_DATETIME, /* 12 */
-    TYPE_BINARY,
-    /* 13 */                     // Not implemented
-    TYPE_DECIMAL [[deprecated]], /* 14 */
-    TYPE_CHAR,                   /* 15 */
-
-    TYPE_STRUCT,    /* 16 */
-    TYPE_ARRAY,     /* 17 */
-    TYPE_MAP,       /* 18 */
-    TYPE_HLL,       /* 19 */
-    TYPE_DECIMALV2, /* 20 */
-
-    TYPE_TIME,           /* 21 */
-    TYPE_OBJECT,         /* 22 */
-    TYPE_STRING,         /* 23 */
-    TYPE_QUANTILE_STATE, /* 24 */
-    TYPE_DATEV2,         /* 25 */
-    TYPE_DATETIMEV2,     /* 26 */
-    TYPE_TIMEV2,         /* 27 */
-    TYPE_DECIMAL32,      /* 28 */
-    TYPE_DECIMAL64,      /* 29 */
-    TYPE_DECIMAL128,     /* 30 */
-};
-
 PrimitiveType convert_type_to_primitive(FunctionContext::Type type);
 
 bool is_enumeration_type(PrimitiveType type);
diff --git a/be/src/runtime/raw_value.h b/be/src/runtime/raw_value.h
index 7c5f990061..990e8b9015 100644
--- a/be/src/runtime/raw_value.h
+++ b/be/src/runtime/raw_value.h
@@ -442,10 +442,15 @@ inline uint32_t RawValue::zlib_crc32(const void* v, const 
TypeDescriptor& type,
     case TYPE_SMALLINT:
         return HashUtil::zlib_crc_hash(v, 2, seed);
     case TYPE_INT:
+    case TYPE_DATEV2:
+    case TYPE_DECIMAL32:
         return HashUtil::zlib_crc_hash(v, 4, seed);
     case TYPE_BIGINT:
+    case TYPE_DATETIMEV2:
+    case TYPE_DECIMAL64:
         return HashUtil::zlib_crc_hash(v, 8, seed);
     case TYPE_LARGEINT:
+    case TYPE_DECIMAL128:
         return HashUtil::zlib_crc_hash(v, 16, seed);
     case TYPE_FLOAT:
         return HashUtil::zlib_crc_hash(v, 4, seed);
@@ -458,21 +463,6 @@ inline uint32_t RawValue::zlib_crc32(const void* v, const 
TypeDescriptor& type,
         int len = date_val->to_buffer(buf);
         return HashUtil::zlib_crc_hash(buf, len, seed);
     }
-    case TYPE_DATEV2: {
-        const vectorized::DateV2Value<doris::vectorized::DateV2ValueType>* 
date_v2_val =
-                (const 
vectorized::DateV2Value<doris::vectorized::DateV2ValueType>*)v;
-        char buf[64];
-        int len = date_v2_val->to_buffer(buf);
-        return HashUtil::zlib_crc_hash(buf, len, seed);
-    }
-
-    case TYPE_DATETIMEV2: {
-        const vectorized::DateV2Value<doris::vectorized::DateTimeV2ValueType>* 
date_v2_val =
-                (const 
vectorized::DateV2Value<doris::vectorized::DateTimeV2ValueType>*)v;
-        char buf[64];
-        int len = date_v2_val->to_buffer(buf);
-        return HashUtil::zlib_crc_hash(buf, len, seed);
-    }
 
     case TYPE_DECIMALV2: {
         const DecimalV2Value* dec_val = (const DecimalV2Value*)v;
@@ -481,13 +471,6 @@ inline uint32_t RawValue::zlib_crc32(const void* v, const 
TypeDescriptor& type,
         seed = HashUtil::zlib_crc_hash(&int_val, sizeof(int_val), seed);
         return HashUtil::zlib_crc_hash(&frac_val, sizeof(frac_val), seed);
     }
-
-    case TYPE_DECIMAL32:
-        return HashUtil::zlib_crc_hash(v, 4, seed);
-    case TYPE_DECIMAL64:
-        return HashUtil::zlib_crc_hash(v, 8, seed);
-    case TYPE_DECIMAL128:
-        return HashUtil::zlib_crc_hash(v, 16, seed);
     default:
         DCHECK(false) << "invalid type: " << type;
         return 0;
diff --git a/be/src/util/hash_util.hpp b/be/src/util/hash_util.hpp
index 3a7ac11704..0ebcc6e11b 100644
--- a/be/src/util/hash_util.hpp
+++ b/be/src/util/hash_util.hpp
@@ -47,6 +47,13 @@ public:
     static uint32_t zlib_crc_hash(const void* data, int32_t bytes, uint32_t 
hash) {
         return crc32(hash, (const unsigned char*)data, bytes);
     }
+
+    static uint32_t zlib_crc_hash_null(uint32_t hash) {
+        // NULL is treated as 0 when hashing
+        static const int INT_VALUE = 0;
+        return crc32(hash, (const unsigned char*)(&INT_VALUE), 4);
+    }
+
 #if defined(__SSE4_2__) || defined(__aarch64__)
     // Compute the Crc32 hash for data using SSE4 instructions.  The input 
hash parameter is
     // the current hash/seed value.
diff --git a/be/src/vec/columns/column.h b/be/src/vec/columns/column.h
index 71d737274c..6d085fa7c7 100644
--- a/be/src/vec/columns/column.h
+++ b/be/src/vec/columns/column.h
@@ -20,6 +20,7 @@
 
 #pragma once
 
+#include "runtime/define_primitive_type.h"
 #include "vec/common/cow.h"
 #include "vec/common/exception.h"
 #include "vec/common/pod_array_fwd.h"
@@ -43,6 +44,18 @@ class SipHash;
         }                                                                \
     }
 
+#define DO_CRC_HASHES_FUNCTION_COLUMN_IMPL()                                   
      \
+    if (null_data == nullptr) {                                                
      \
+        for (size_t i = 0; i < s; i++) {                                       
      \
+            hashes[i] = HashUtil::zlib_crc_hash(&data[i], sizeof(T), 
hashes[i]);     \
+        }                                                                      
      \
+    } else {                                                                   
      \
+        for (size_t i = 0; i < s; i++) {                                       
      \
+            if (null_data[i] == 0)                                             
      \
+                hashes[i] = HashUtil::zlib_crc_hash(&data[i], sizeof(T), 
hashes[i]); \
+        }                                                                      
      \
+    }
+
 namespace doris::vectorized {
 
 class Arena;
@@ -322,6 +335,14 @@ public:
         LOG(FATAL) << "update_hashes_with_value not supported";
     };
 
+    /// Update state of crc32 hash function with the values of n elements to avoid the virtual function call.
+    /// null_data marks which elements take part in the hash: null_data == nullptr
+    /// means every element is hashed, else only elements with null_data[i] == 0 are hashed.
+    virtual void update_crcs_with_value(std::vector<uint32_t>& hash, 
PrimitiveType type,
+                                        const uint8_t* __restrict null_data = 
nullptr) const {
+        LOG(FATAL) << "update_crcs_with_value not supported";
+    };
+
     /** Removes elements that don't match the filter.
       * Is used in WHERE and HAVING operations.
       * If result_size_hint > 0, then makes advance reserve(result_size_hint) 
for the result column;
diff --git a/be/src/vec/columns/column_const.cpp 
b/be/src/vec/columns/column_const.cpp
index 946074cf61..4bd72ccf8d 100644
--- a/be/src/vec/columns/column_const.cpp
+++ b/be/src/vec/columns/column_const.cpp
@@ -20,6 +20,7 @@
 
 #include "vec/columns/column_const.h"
 
+#include "runtime/raw_value.h"
 #include "vec/columns/columns_common.h"
 #include "vec/common/pod_array.h"
 #include "vec/common/sip_hash.h"
@@ -103,6 +104,23 @@ void 
ColumnConst::update_hashes_with_value(std::vector<SipHash>& hashes,
     }
 }
 
+void ColumnConst::update_crcs_with_value(std::vector<uint32_t>& hashes, 
doris::PrimitiveType type,
+                                         const uint8_t* __restrict null_data) 
const {
+    DCHECK(null_data == nullptr);
+    DCHECK(hashes.size() == size());
+    auto real_data = data->get_data_at(0);
+    if (real_data.data == nullptr) {
+        for (int i = 0; i < hashes.size(); ++i) {
+            hashes[i] = HashUtil::zlib_crc_hash_null(hashes[i]);
+        }
+    } else {
+        for (int i = 0; i < hashes.size(); ++i) {
+            hashes[i] = RawValue::zlib_crc32(real_data.data, real_data.size, 
TypeDescriptor {type},
+                                             hashes[i]);
+        }
+    }
+}
+
 MutableColumns ColumnConst::scatter(ColumnIndex num_columns, const Selector& 
selector) const {
     if (s != selector.size()) {
         LOG(FATAL) << fmt::format("Size of selector ({}) doesn't match size of 
column ({})",
diff --git a/be/src/vec/columns/column_const.h 
b/be/src/vec/columns/column_const.h
index be7f56ab23..422316075b 100644
--- a/be/src/vec/columns/column_const.h
+++ b/be/src/vec/columns/column_const.h
@@ -135,6 +135,9 @@ public:
     void update_hashes_with_value(std::vector<SipHash>& hashes,
                                   const uint8_t* __restrict null_data) const 
override;
 
+    void update_crcs_with_value(std::vector<uint32_t>& hashes, PrimitiveType 
type,
+                                const uint8_t* __restrict null_data) const 
override;
+
     ColumnPtr filter(const Filter& filt, ssize_t result_size_hint) const 
override;
     ColumnPtr replicate(const Offsets& offsets) const override;
     void replicate(const uint32_t* counts, size_t target_size, IColumn& 
column) const override;
diff --git a/be/src/vec/columns/column_decimal.cpp 
b/be/src/vec/columns/column_decimal.cpp
index ea904c8c30..9b6470a371 100644
--- a/be/src/vec/columns/column_decimal.cpp
+++ b/be/src/vec/columns/column_decimal.cpp
@@ -127,6 +127,39 @@ void 
ColumnDecimal<T>::update_hashes_with_value(std::vector<SipHash>& hashes,
     SIP_HASHES_FUNCTION_COLUMN_IMPL();
 }
 
+template <typename T>
+void ColumnDecimal<T>::update_crcs_with_value(std::vector<uint32_t>& hashes, 
PrimitiveType type,
+                                              const uint8_t* __restrict 
null_data) const {
+    auto s = hashes.size();
+    DCHECK(s == size());
+
+    if constexpr (!std::is_same_v<T, Decimal128>) {
+        DO_CRC_HASHES_FUNCTION_COLUMN_IMPL()
+    } else {
+        if (type == TYPE_DECIMALV2) {
+            auto decimalv2_do_crc = [&](size_t i) {
+                const DecimalV2Value& dec_val = (const DecimalV2Value&)data[i];
+                int64_t int_val = dec_val.int_value();
+                int32_t frac_val = dec_val.frac_value();
+                hashes[i] = HashUtil::zlib_crc_hash(&int_val, sizeof(int_val), 
hashes[i]);
+                hashes[i] = HashUtil::zlib_crc_hash(&frac_val, 
sizeof(frac_val), hashes[i]);
+            };
+
+            if (null_data == nullptr) {
+                for (size_t i = 0; i < s; i++) {
+                    decimalv2_do_crc(i);
+                }
+            } else {
+                for (size_t i = 0; i < s; i++) {
+                    if (null_data[i] == 0) decimalv2_do_crc(i);
+                }
+            }
+        } else {
+            DO_CRC_HASHES_FUNCTION_COLUMN_IMPL()
+        }
+    }
+}
+
 template <typename T>
 void ColumnDecimal<T>::get_permutation(bool reverse, size_t limit, int,
                                        IColumn::Permutation& res) const {
diff --git a/be/src/vec/columns/column_decimal.h 
b/be/src/vec/columns/column_decimal.h
index 81e037278c..41dba1827d 100644
--- a/be/src/vec/columns/column_decimal.h
+++ b/be/src/vec/columns/column_decimal.h
@@ -156,6 +156,8 @@ public:
     void update_hash_with_value(size_t n, SipHash& hash) const override;
     void update_hashes_with_value(std::vector<SipHash>& hash,
                                   const uint8_t* __restrict null_data) const 
override;
+    void update_crcs_with_value(std::vector<uint32_t>& hashes, PrimitiveType 
type,
+                                const uint8_t* __restrict null_data) const 
override;
 
     int compare_at(size_t n, size_t m, const IColumn& rhs_, int 
nan_direction_hint) const override;
     void get_permutation(bool reverse, size_t limit, int nan_direction_hint,
diff --git a/be/src/vec/columns/column_nullable.cpp 
b/be/src/vec/columns/column_nullable.cpp
index 4f18163479..29b1887421 100644
--- a/be/src/vec/columns/column_nullable.cpp
+++ b/be/src/vec/columns/column_nullable.cpp
@@ -68,6 +68,25 @@ void 
ColumnNullable::update_hashes_with_value(std::vector<SipHash>& hashes,
     }
 }
 
+void ColumnNullable::update_crcs_with_value(std::vector<uint32_t>& hashes,
+                                            doris::PrimitiveType type,
+                                            const uint8_t* __restrict 
null_data) const {
+    DCHECK(null_data == nullptr);
+    auto s = hashes.size();
+    DCHECK(s == size());
+    auto* __restrict real_null_data = assert_cast<const 
ColumnUInt8&>(*null_map).get_data().data();
+    if (!has_null()) {
+        nested_column->update_crcs_with_value(hashes, type, nullptr);
+    } else {
+        for (int i = 0; i < s; ++i) {
+            if (real_null_data[i] != 0) {
+                hashes[i] = HashUtil::zlib_crc_hash_null(hashes[i]);
+            }
+        }
+        nested_column->update_crcs_with_value(hashes, type, real_null_data);
+    }
+}
+
 MutableColumnPtr ColumnNullable::clone_resized(size_t new_size) const {
     MutableColumnPtr new_nested_col = 
get_nested_column().clone_resized(new_size);
     auto new_null_map = ColumnUInt8::create();
diff --git a/be/src/vec/columns/column_nullable.h 
b/be/src/vec/columns/column_nullable.h
index 2a8bf52dd2..523ff337ae 100644
--- a/be/src/vec/columns/column_nullable.h
+++ b/be/src/vec/columns/column_nullable.h
@@ -161,6 +161,8 @@ public:
     void update_hash_with_value(size_t n, SipHash& hash) const override;
     void update_hashes_with_value(std::vector<SipHash>& hashes,
                                   const uint8_t* __restrict null_data) const 
override;
+    void update_crcs_with_value(std::vector<uint32_t>& hash, PrimitiveType 
type,
+                                const uint8_t* __restrict null_data) const 
override;
     void get_extremes(Field& min, Field& max) const override;
 
     MutableColumns scatter(ColumnIndex num_columns, const Selector& selector) 
const override {
diff --git a/be/src/vec/columns/column_string.cpp 
b/be/src/vec/columns/column_string.cpp
index 18f0d74d23..20f0d3c534 100644
--- a/be/src/vec/columns/column_string.cpp
+++ b/be/src/vec/columns/column_string.cpp
@@ -104,6 +104,26 @@ void ColumnString::insert_indices_from(const IColumn& src, 
const int* indices_be
     }
 }
 
+void ColumnString::update_crcs_with_value(std::vector<uint32_t>& hashes, 
doris::PrimitiveType type,
+                                          const uint8_t* __restrict null_data) 
const {
+    auto s = hashes.size();
+    DCHECK(s == size());
+
+    if (null_data == nullptr) {
+        for (size_t i = 0; i < s; i++) {
+            auto data_ref = get_data_at(i);
+            hashes[i] = HashUtil::zlib_crc_hash(data_ref.data, data_ref.size, 
hashes[i]);
+        }
+    } else {
+        for (size_t i = 0; i < s; i++) {
+            if (null_data[i] == 0) {
+                auto data_ref = get_data_at(i);
+                hashes[i] = HashUtil::zlib_crc_hash(data_ref.data, 
data_ref.size, hashes[i]);
+            }
+        }
+    }
+}
+
 ColumnPtr ColumnString::filter(const Filter& filt, ssize_t result_size_hint) 
const {
     if (offsets.size() == 0) return ColumnString::create();
 
diff --git a/be/src/vec/columns/column_string.h 
b/be/src/vec/columns/column_string.h
index 8c504c6d08..ee8e6cfdd3 100644
--- a/be/src/vec/columns/column_string.h
+++ b/be/src/vec/columns/column_string.h
@@ -256,6 +256,9 @@ public:
         SIP_HASHES_FUNCTION_COLUMN_IMPL();
     }
 
+    void update_crcs_with_value(std::vector<uint32_t>& hashes, PrimitiveType 
type,
+                                const uint8_t* __restrict null_data) const 
override;
+
     void insert_range_from(const IColumn& src, size_t start, size_t length) 
override;
 
     void insert_indices_from(const IColumn& src, const int* indices_begin,
diff --git a/be/src/vec/columns/column_vector.cpp 
b/be/src/vec/columns/column_vector.cpp
index d60f8816a1..1678e5d7f4 100644
--- a/be/src/vec/columns/column_vector.cpp
+++ b/be/src/vec/columns/column_vector.cpp
@@ -26,7 +26,6 @@
 #include <cmath>
 #include <cstring>
 
-#include "runtime/datetime_value.h"
 #include "util/simd/bits.h"
 #include "vec/common/arena.h"
 #include "vec/common/assert_cast.h"
@@ -119,6 +118,38 @@ void ColumnVector<T>::sort_column(const ColumnSorter* 
sorter, EqualFlags& flags,
     sorter->template sort_column(static_cast<const Self&>(*this), flags, 
perms, range, last_column);
 }
 
+template <typename T>
+void ColumnVector<T>::update_crcs_with_value(std::vector<uint32_t>& hashes, 
PrimitiveType type,
+                                             const uint8_t* __restrict 
null_data) const {
+    auto s = hashes.size();
+    DCHECK(s == size());
+
+    if constexpr (!std::is_same_v<T, Int64>) {
+        DO_CRC_HASHES_FUNCTION_COLUMN_IMPL()
+    } else {
+        if (type == TYPE_DATE || type == TYPE_DATETIME) {
+            char buf[64];
+            auto date_convert_do_crc = [&](size_t i) {
+                const DateTimeValue& date_val = (const DateTimeValue&)data[i];
+                auto len = date_val.to_buffer(buf);
+                hashes[i] = HashUtil::zlib_crc_hash(buf, len, hashes[i]);
+            };
+
+            if (null_data == nullptr) {
+                for (size_t i = 0; i < s; i++) {
+                    date_convert_do_crc(i);
+                }
+            } else {
+                for (size_t i = 0; i < s; i++) {
+                    if (null_data[i] == 0) date_convert_do_crc(i);
+                }
+            }
+        } else {
+            DO_CRC_HASHES_FUNCTION_COLUMN_IMPL()
+        }
+    }
+}
+
 template <typename T>
 struct ColumnVector<T>::less {
     const Self& parent;
diff --git a/be/src/vec/columns/column_vector.h 
b/be/src/vec/columns/column_vector.h
index 3a8ec82382..447886d08f 100644
--- a/be/src/vec/columns/column_vector.h
+++ b/be/src/vec/columns/column_vector.h
@@ -250,6 +250,9 @@ public:
     void update_hashes_with_value(std::vector<SipHash>& hashes,
                                   const uint8_t* __restrict null_data) const 
override;
 
+    void update_crcs_with_value(std::vector<uint32_t>& hashes, PrimitiveType 
type,
+                                const uint8_t* __restrict null_data) const 
override;
+
     size_t byte_size() const override { return data.size() * sizeof(data[0]); }
 
     size_t allocated_bytes() const override { return data.allocated_bytes(); }
diff --git a/be/src/vec/sink/vdata_stream_sender.cpp 
b/be/src/vec/sink/vdata_stream_sender.cpp
index 679890a60c..03482ed3b6 100644
--- a/be/src/vec/sink/vdata_stream_sender.cpp
+++ b/be/src/vec/sink/vdata_stream_sender.cpp
@@ -536,24 +536,12 @@ Status VDataStreamSender::send(RuntimeState* state, 
Block* block) {
         // vectorized calculate hash val
         int rows = block->rows();
         // for each row, we have a hash_val
-        std::vector<size_t> hash_vals(rows);
+        std::vector<uint32_t> hash_vals(rows);
 
         // result[j] means column index, i means rows index
         for (int j = 0; j < result_size; ++j) {
-            auto& column = block->get_by_position(result[j]).column;
-            for (int i = 0; i < rows; ++i) {
-                auto val = column->get_data_at(i);
-                if (val.data == nullptr) {
-                    // nullptr is treat as 0 when hash
-                    static const int INT_VALUE = 0;
-                    static const TypeDescriptor INT_TYPE(TYPE_INT);
-                    hash_vals[i] = RawValue::zlib_crc32(&INT_VALUE, INT_TYPE, 
hash_vals[i]);
-                } else {
-                    hash_vals[i] = RawValue::zlib_crc32(val.data, val.size,
-                                                        
_partition_expr_ctxs[j]->root()->type(),
-                                                        hash_vals[i]);
-                }
-            }
+            block->get_by_position(result[j]).column->update_crcs_with_value(
+                    hash_vals, _partition_expr_ctxs[j]->root()->type().type);
         }
 
         Block::erase_useless_column(block, column_to_keep);
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/analysis/DateLiteral.java 
b/fe/fe-core/src/main/java/org/apache/doris/analysis/DateLiteral.java
index 91a17cd20c..2bdb3e43ac 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/analysis/DateLiteral.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/analysis/DateLiteral.java
@@ -40,6 +40,7 @@ import java.io.DataInput;
 import java.io.DataOutput;
 import java.io.IOException;
 import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
 import java.sql.Timestamp;
 import java.time.LocalDateTime;
 import java.time.Year;
@@ -495,14 +496,27 @@ public class DateLiteral extends LiteralExpr {
     // Date column and Datetime column's hash value is not same.
     @Override
     public ByteBuffer getHashValue(PrimitiveType type) {
-        // This hash value should be computed using new String since precision 
is introduced to datetime.
-        // But it is hard to keep compatibility. So I don't change this 
function here.
-        String value = convertToString(type);
         ByteBuffer buffer;
-        try {
-            buffer = ByteBuffer.wrap(value.getBytes("UTF-8"));
-        } catch (Exception e) {
-            throw new RuntimeException(e);
+        if (type == PrimitiveType.DATEV2) {
+            int value = (int) ((year << 9) | (month << 5) | day);
+            buffer = ByteBuffer.allocate(4);
+            buffer.order(ByteOrder.LITTLE_ENDIAN);
+            buffer.putInt(value);
+        } else if (type == PrimitiveType.DATETIMEV2) {
+            long value =  (year << 50) | (month << 46) | (day << 41) | (hour 
<< 36)
+                    | (minute << 30) | (second << 24) | microsecond;
+            buffer = ByteBuffer.allocate(8);
+            buffer.order(ByteOrder.LITTLE_ENDIAN);
+            buffer.putLong(value);
+        } else {
+            // This hash value should be computed using new String since 
precision is introduced to datetime.
+            // But it is hard to keep compatibility. So I don't change this 
function here.
+            String value = convertToString(type);
+            try {
+                buffer = ByteBuffer.wrap(value.getBytes("UTF-8"));
+            } catch (Exception e) {
+                throw new RuntimeException(e);
+            }
         }
         return buffer;
     }
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/analysis/DecimalLiteral.java 
b/fe/fe-core/src/main/java/org/apache/doris/analysis/DecimalLiteral.java
index 1c013cf5f1..53ca2ba9ca 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/analysis/DecimalLiteral.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/analysis/DecimalLiteral.java
@@ -159,9 +159,6 @@ public class DecimalLiteral extends LiteralExpr {
                 buffer.putLong(value.longValue());
                 break;
             case DECIMALV2:
-            case DECIMAL32:
-            case DECIMAL64:
-            case DECIMAL128:
                 buffer = ByteBuffer.allocate(12);
                 buffer.order(ByteOrder.LITTLE_ENDIAN);
 
@@ -170,6 +167,19 @@ public class DecimalLiteral extends LiteralExpr {
                 buffer.putLong(integerValue);
                 buffer.putInt(fracValue);
                 break;
+            case DECIMAL32:
+                buffer = ByteBuffer.allocate(4);
+                buffer.order(ByteOrder.LITTLE_ENDIAN);
+                buffer.putInt(value.unscaledValue().intValue());
+                break;
+            case DECIMAL64:
+                buffer = ByteBuffer.allocate(8);
+                buffer.order(ByteOrder.LITTLE_ENDIAN);
+                buffer.putLong(value.unscaledValue().longValue());
+                break;
+            case DECIMAL128:
+                LargeIntLiteral tmp = new 
LargeIntLiteral(value.unscaledValue());
+                return tmp.getHashValue(type);
             default:
                 return super.getHashValue(type);
         }
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/analysis/LargeIntLiteral.java 
b/fe/fe-core/src/main/java/org/apache/doris/analysis/LargeIntLiteral.java
index 7cb87669a4..fde708b05b 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/analysis/LargeIntLiteral.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/analysis/LargeIntLiteral.java
@@ -62,6 +62,12 @@ public class LargeIntLiteral extends LiteralExpr {
         analysisDone();
     }
 
+    public LargeIntLiteral(BigInteger v) {
+        super();
+        type = Type.LARGEINT;
+        value = v;
+    }
+
     public LargeIntLiteral(String value) throws AnalysisException {
         super();
         BigInteger bigInt;


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to