dataroaring commented on code in PR #13785:
URL: https://github.com/apache/doris/pull/13785#discussion_r1018098498


##########
be/src/vec/aggregate_functions/aggregate_function_sequence_match.h:
##########
@@ -0,0 +1,712 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// This file is copied from
+// 
https://github.com/ClickHouse/ClickHouse/blob/master/AggregateFunctionSequenceMatch.h
+// and modified by Doris
+
+#pragma once
+
+#include <bitset>
+
+#include "common/logging.h"
+#include "vec/aggregate_functions/aggregate_function.h"
+#include "vec/columns/column_array.h"
+#include "vec/columns/columns_number.h"
+#include "vec/data_types/data_type_decimal.h"
+#include "vec/io/io_helper.h"
+#include "vec/io/var_int.h"
+
+namespace doris::vectorized {
+
+namespace ErrorCodes {
+extern const int TOO_SLOW;
+extern const int SYNTAX_ERROR;
+extern const int BAD_ARGUMENTS;
+extern const int LOGICAL_ERROR;

Review Comment:
   Useless code?



##########
docs/zh-CN/docs/sql-manual/sql-functions/aggregate-functions/sequence_count.md:
##########
@@ -0,0 +1,261 @@
+---
+{
+    "title": "SEQUENCE-COUNT",
+    "language": "zh-CN"
+}
+---
+
+<!-- 
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements.  See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership.  The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License.  You may obtain a copy of the License at
+  http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing,
+software distributed under the License is distributed on an
+"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+KIND, either express or implied.  See the License for the
+specific language governing permissions and limitations
+under the License.
+-->
+
+## SEQUENCE-COUNT
+### Description
+#### Syntax
+
+`sequence_count(pattern, timestamp, cond1, cond2, ...);`
+
+计算与模式匹配的事件链的数量。该函数搜索不重叠的事件链。当前链匹配后,它开始搜索下一个链。
+
+**警告!** 
+
+在同一秒钟发生的事件可能以未定义的顺序排列在序列中,影响最终结果。
+
+#### Arguments
+
+`pattern` — 模式字符串.
+
+**模式语法**
+
+`(?N)` — 在位置N匹配条件参数。 条件在编号 `[1, 32]` 范围。 例如, `(?1)` 匹配传递给 `cond1` 参数。
+
+`.*` — 匹配任意数量的事件。 不需要条件参数来匹配这个模式。
+
+`(?t operator value)` — 分开两个事件的时间。 我们将时间直接转化为对应的数字来描述时间。
+
+例如
+
+```
+2022-11-02 10:41:00 -> 20221102104100
+2022-11-02 11:41:00 -> 20221102114100
+```
+
+`t`表示为两个时间的直接差值。 例如: `(?1)(?t>1800)(?2)` 匹配彼此发生超过18分钟的事件, 
`(?1)(?t>10000)(?2)`匹配彼此发生超过1小时的事件。 这些事件之间可以存在任意数量的任何事件。 您可以使用 `>=`, `>`, `<`, 
`<=`, `==` 运算符。
+
+`timestamp` —  包含时间的列。典型的时间类型是: `Date` 和 `DateTime`。您还可以使用任何支持的 `UInt` 数据类型。

Review Comment:
   典型的时间类型是: `Date` 和 `DateTime`,也可以使用任何支持的 `UInt` 数据类型。



##########
be/src/vec/aggregate_functions/aggregate_function_sequence_match.h:
##########
@@ -0,0 +1,712 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// This file is copied from
+// 
https://github.com/ClickHouse/ClickHouse/blob/master/AggregateFunctionSequenceMatch.h
+// and modified by Doris
+
+#pragma once
+
+#include <bitset>
+
+#include "common/logging.h"
+#include "vec/aggregate_functions/aggregate_function.h"
+#include "vec/columns/column_array.h"
+#include "vec/columns/columns_number.h"
+#include "vec/data_types/data_type_decimal.h"
+#include "vec/io/io_helper.h"
+#include "vec/io/var_int.h"
+
+namespace doris::vectorized {
+
+namespace ErrorCodes {
+extern const int TOO_SLOW;
+extern const int SYNTAX_ERROR;
+extern const int BAD_ARGUMENTS;
+extern const int LOGICAL_ERROR;
+} // namespace ErrorCodes
+
+template <template <typename> class Comparator>
+struct ComparePairFirst final {
+    template <typename T1, typename T2>
+    bool operator()(const std::pair<T1, T2>& lhs, const std::pair<T1, T2>& 
rhs) const {
+        return Comparator<T1> {}(lhs.first, rhs.first);
+    }
+};
+
+static constexpr size_t max_events = 32;
+
+/// Max number of iterations to match the pattern against a sequence, 
exception thrown when exceeded
+constexpr auto sequence_match_max_iterations = 1000000;
+
+template <typename DateValueType, typename NativeType, typename Derived>
+struct AggregateFunctionSequenceMatchData final {
+    using Timestamp = DateValueType;
+    using Events = std::bitset<max_events>;
+    using TimestampEvents = std::pair<Timestamp, Events>;
+    using Comparator = ComparePairFirst<std::less>;
+
+    AggregateFunctionSequenceMatchData() { reset(); }
+
+public:
+    const std::string get_pattern() const { return pattern; }
+
+    size_t get_arg_count() const { return arg_count; }
+
+    void init(const std::string pattern, size_t arg_count) {
+        if (!init_flag) {
+            this->pattern = pattern;
+            this->arg_count = arg_count;
+            parsePattern();
+            init_flag = true;
+        }
+    }
+
+    void reset() {
+        sorted = true;
+        init_flag = false;
+        pattern_has_time = false;
+        pattern = "";
+        arg_count = 0;
+        conditions_met.reset();
+        conditions_in_pattern.reset();
+
+        events_list.clear();
+        actions.clear();
+        dfa_states.clear();
+    }
+
+    void add(const Timestamp& timestamp, const Events& events) {
+        /// store information exclusively for rows with at least one event
+        if (events.any()) {
+            events_list.emplace_back(timestamp, events);
+            sorted = false;
+            conditions_met |= events;
+        }
+    }
+
+    void merge(const AggregateFunctionSequenceMatchData& other) {
+        if (other.events_list.empty()) return;
+
+        events_list.insert(std::begin(other.events_list), 
std::end(other.events_list));
+        sorted = false;
+        conditions_met |= other.conditions_met;
+    }
+
+    void sort() {
+        if (sorted) return;
+
+        std::sort(std::begin(events_list), std::end(events_list), Comparator 
{});
+        sorted = true;
+    }
+
+    void write(BufferWritable& buf) const {
+        write_binary(sorted, buf);
+        write_binary(events_list.size(), buf);
+
+        for (const auto& events : events_list) {
+            write_binary(events.first, buf);
+            write_binary(events.second.to_ulong(), buf);
+        }
+
+        UInt32 conditions_met_value = conditions_met.to_ulong();
+        write_binary(conditions_met_value, buf);
+
+        write_binary(pattern, buf);
+        write_binary(arg_count, buf);
+    }
+
+    void read(BufferReadable& buf) {
+        read_binary(sorted, buf);
+
+        size_t events_list_size;
+        read_binary(events_list_size, buf);
+
+        events_list.clear();
+        events_list.reserve(events_list_size);
+
+        for (size_t i = 0; i < events_list_size; ++i) {
+            Timestamp timestamp;
+            read_binary(timestamp, buf);
+
+            UInt64 events;
+            read_binary(events, buf);
+
+            events_list.emplace_back(timestamp, Events {events});
+        }
+
+        UInt32 conditions_met_value;
+        read_binary(conditions_met_value, buf);
+        conditions_met = conditions_met_value;
+
+        read_binary(pattern, buf);
+        read_binary(arg_count, buf);
+    }
+
+private:
+    enum class PatternActionType {
+        SpecificEvent,
+        AnyEvent,
+        KleeneStar,
+        TimeLessOrEqual,
+        TimeLess,
+        TimeGreaterOrEqual,
+        TimeGreater,
+        TimeEqual
+    };
+
+    struct PatternAction final {
+        PatternActionType type;
+        std::uint64_t extra;
+
+        PatternAction() = default;
+        explicit PatternAction(const PatternActionType type_, const 
std::uint64_t extra_ = 0)
+                : type {type_}, extra {extra_} {}
+    };
+
+    using PatternActions = PODArrayWithStackMemory<PatternAction, 64>;
+
+    Derived& derived() { return static_cast<Derived&>(*this); }
+
+    void parsePattern() {
+        actions.clear();
+        actions.emplace_back(PatternActionType::KleeneStar);
+
+        dfa_states.clear();
+        dfa_states.emplace_back(true);
+
+        pattern_has_time = false;
+
+        const char* pos = pattern.data();
+        const char* begin = pos;
+        const char* end = pos + pattern.size();
+        auto throw_exception = [&](const std::string& msg) {
+            LOG(FATAL) << msg + " '" + std::string(pos, end) + "' at position 
" +
+                                  std::to_string(pos - begin);

Review Comment:
   Add a comment, e.g.:
   // The pattern is checked in FE, so it should be valid here; we still check it
   // and return 0 if it is invalid.



##########
be/src/vec/aggregate_functions/aggregate_function_sequence_match.h:
##########
@@ -0,0 +1,712 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// This file is copied from
+// 
https://github.com/ClickHouse/ClickHouse/blob/master/AggregateFunctionSequenceMatch.h
+// and modified by Doris
+
+#pragma once
+
+#include <bitset>
+
+#include "common/logging.h"
+#include "vec/aggregate_functions/aggregate_function.h"
+#include "vec/columns/column_array.h"
+#include "vec/columns/columns_number.h"
+#include "vec/data_types/data_type_decimal.h"
+#include "vec/io/io_helper.h"
+#include "vec/io/var_int.h"
+
+namespace doris::vectorized {
+
+namespace ErrorCodes {
+extern const int TOO_SLOW;
+extern const int SYNTAX_ERROR;
+extern const int BAD_ARGUMENTS;
+extern const int LOGICAL_ERROR;
+} // namespace ErrorCodes
+
+template <template <typename> class Comparator>
+struct ComparePairFirst final {
+    template <typename T1, typename T2>
+    bool operator()(const std::pair<T1, T2>& lhs, const std::pair<T1, T2>& 
rhs) const {
+        return Comparator<T1> {}(lhs.first, rhs.first);
+    }
+};
+
+static constexpr size_t max_events = 32;
+
+/// Max number of iterations to match the pattern against a sequence, 
exception thrown when exceeded
+constexpr auto sequence_match_max_iterations = 1000000;
+
+template <typename DateValueType, typename NativeType, typename Derived>
+struct AggregateFunctionSequenceMatchData final {
+    using Timestamp = DateValueType;
+    using Events = std::bitset<max_events>;
+    using TimestampEvents = std::pair<Timestamp, Events>;
+    using Comparator = ComparePairFirst<std::less>;
+
+    AggregateFunctionSequenceMatchData() { reset(); }
+
+public:
+    const std::string get_pattern() const { return pattern; }
+
+    size_t get_arg_count() const { return arg_count; }
+
+    void init(const std::string pattern, size_t arg_count) {
+        if (!init_flag) {
+            this->pattern = pattern;
+            this->arg_count = arg_count;
+            parsePattern();
+            init_flag = true;
+        }
+    }
+
+    void reset() {
+        sorted = true;
+        init_flag = false;
+        pattern_has_time = false;
+        pattern = "";
+        arg_count = 0;
+        conditions_met.reset();
+        conditions_in_pattern.reset();
+
+        events_list.clear();
+        actions.clear();
+        dfa_states.clear();
+    }
+
+    void add(const Timestamp& timestamp, const Events& events) {
+        /// store information exclusively for rows with at least one event
+        if (events.any()) {
+            events_list.emplace_back(timestamp, events);
+            sorted = false;
+            conditions_met |= events;
+        }
+    }
+
+    void merge(const AggregateFunctionSequenceMatchData& other) {
+        if (other.events_list.empty()) return;
+
+        events_list.insert(std::begin(other.events_list), 
std::end(other.events_list));
+        sorted = false;
+        conditions_met |= other.conditions_met;
+    }
+
+    void sort() {
+        if (sorted) return;
+
+        std::sort(std::begin(events_list), std::end(events_list), Comparator 
{});
+        sorted = true;
+    }
+
+    void write(BufferWritable& buf) const {
+        write_binary(sorted, buf);
+        write_binary(events_list.size(), buf);
+
+        for (const auto& events : events_list) {
+            write_binary(events.first, buf);
+            write_binary(events.second.to_ulong(), buf);
+        }
+
+        UInt32 conditions_met_value = conditions_met.to_ulong();
+        write_binary(conditions_met_value, buf);
+
+        write_binary(pattern, buf);
+        write_binary(arg_count, buf);
+    }
+
+    void read(BufferReadable& buf) {
+        read_binary(sorted, buf);
+
+        size_t events_list_size;
+        read_binary(events_list_size, buf);
+
+        events_list.clear();
+        events_list.reserve(events_list_size);
+
+        for (size_t i = 0; i < events_list_size; ++i) {
+            Timestamp timestamp;
+            read_binary(timestamp, buf);
+
+            UInt64 events;
+            read_binary(events, buf);
+
+            events_list.emplace_back(timestamp, Events {events});
+        }
+
+        UInt32 conditions_met_value;
+        read_binary(conditions_met_value, buf);
+        conditions_met = conditions_met_value;
+
+        read_binary(pattern, buf);
+        read_binary(arg_count, buf);
+    }
+
+private:
+    enum class PatternActionType {
+        SpecificEvent,
+        AnyEvent,
+        KleeneStar,
+        TimeLessOrEqual,
+        TimeLess,
+        TimeGreaterOrEqual,
+        TimeGreater,
+        TimeEqual
+    };
+
+    struct PatternAction final {
+        PatternActionType type;
+        std::uint64_t extra;
+
+        PatternAction() = default;
+        explicit PatternAction(const PatternActionType type_, const 
std::uint64_t extra_ = 0)
+                : type {type_}, extra {extra_} {}
+    };
+
+    using PatternActions = PODArrayWithStackMemory<PatternAction, 64>;
+
+    Derived& derived() { return static_cast<Derived&>(*this); }
+
+    void parsePattern() {
+        actions.clear();
+        actions.emplace_back(PatternActionType::KleeneStar);
+
+        dfa_states.clear();
+        dfa_states.emplace_back(true);
+
+        pattern_has_time = false;
+
+        const char* pos = pattern.data();
+        const char* begin = pos;
+        const char* end = pos + pattern.size();
+        auto throw_exception = [&](const std::string& msg) {
+            LOG(FATAL) << msg + " '" + std::string(pos, end) + "' at position 
" +
+                                  std::to_string(pos - begin);

Review Comment:
   Do not use LOG(FATAL) here; it would crash the BE.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to