xiaokang commented on code in PR #11579:
URL: https://github.com/apache/doris/pull/11579#discussion_r944356288


##########
fe/fe-core/src/main/java/org/apache/doris/analysis/IndexDef.java:
##########
@@ -66,6 +93,19 @@ public void analyze() throws AnalysisException {
             if (columns.size() != distinct.size()) {
                 throw new AnalysisException("columns of index has 
duplicated.");
             }
+            if (arguments != null && !arguments.isEmpty()) {
+                throw new AnalysisException("bimap index do not need 
arguments.");
+            }
+        } else if (indexType == IndexType.NGRAM_BF) {
+            if (columns == null || columns.size() != 1) {

Review Comment:
   common check for indexName can be moved to the outer scope.



##########
fe/fe-core/src/main/java/org/apache/doris/analysis/IndexDef.java:
##########
@@ -142,6 +217,30 @@ public void checkColumn(Column column, KeysType keysType) 
throws AnalysisExcepti
                         "BITMAP index only used in columns of 
DUP_KEYS/UNIQUE_KEYS table or key columns of"
                                 + " AGG_KEYS table. invalid column: " + 
indexColName);
             }
+        } else if (indexType == IndexType.NGRAM_BF) {
+            String indexColName = column.getName();
+            PrimitiveType colType = column.getDataType();
+            if (colType != PrimitiveType.CHAR && colType != 
PrimitiveType.VARCHAR) {

Review Comment:
   colType.isStringType() may be better, if String is also supported.



##########
docs/zh-CN/docs/data-table/index/ngram-bloomfilter-index.md:
##########
@@ -0,0 +1,79 @@
+---
+{
+    "title": "NGram BloomFilter索引",
+    "language": "zh-CN"
+}
+---
+
+<!--
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements.  See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership.  The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing,
+software distributed under the License is distributed on an
+"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+KIND, either express or implied.  See the License for the
+specific language governing permissions and limitations
+under the License.
+-->
+
+# Doris NGram BloomFilter索引及使用使用场景
+
+为了提升like的查询性能,增加了NGram BloomFilter索引,其实现主要参照了ClickHouse的ngrambf。
+
+## NGram BloomFilter创建
+
+表创建时指定:
+
+```sql
+CREATE TABLE `table3` (
+  `siteid` int(11) NULL DEFAULT "10" COMMENT "",
+  `citycode` smallint(6) NULL COMMENT "",
+  `username` varchar(32) NULL DEFAULT "" COMMENT "",
+  INDEX idx_ngrambf (`username`) USING NGRAM_BF (3,256) COMMENT 'username 
ngram_bf index'
+) ENGINE=OLAP
+AGGREGATE KEY(`siteid`, `citycode`, `username`) COMMENT "OLAP"
+DISTRIBUTED BY HASH(`siteid`) BUCKETS 10
+PROPERTIES (
+"replication_num" = "1"
+);
+
+-- 其中(3,256),分别表示ngram的个数和bloomfilter的字节数。
+```
+
+## 查看NGram BloomFilter索引
+
+查看我们在表上建立的NGram BloomFilter索引是使用:
+
+```sql
+show index from example_db.table3;
+```
+
+## 删除NGram BloomFilter索引
+
+
+```sql
+alter table example_db.table3 drop index idx_ngrambf;
+```
+
+## 修改NGram BloomFilter索引
+
+为已有列新增NGram BloomFilter索引:
+
+```sql
+alter table example_db.table3 add index idx_ngrambf(username) using 
NGRAM_BF(3, 256) comment 'username ngram_bf index' 
+```
+
+## **Doris NGram BloomFilter使用注意事项**
+
+1. NGram BloomFilter只支持字符串列
+2. NGram BloomFilter索引和BloomFilter索引为互斥关系,即同一个列只能设置两者中的一个

Review Comment:
   Can we support normal BloomFilter ability in NgramBloomFilter?  It may be 
achieved by adding the whole field value as a token to the bloom filter.



##########
fe/fe-core/src/main/cup/sql_parser.cup:
##########
@@ -518,7 +518,9 @@ nonterminal ColumnDef.DefaultValue opt_default_value;
 nonterminal Boolean opt_if_exists, opt_if_not_exists;
 nonterminal Boolean opt_external;
 nonterminal Boolean opt_force;
-nonterminal IndexDef.IndexType opt_index_type;
+nonterminal IndexDef.IndexType index_type;

Review Comment:
   can still be opt_index_type if bitmap index is kept as default 



##########
be/src/olap/rowset/segment_v2/column_writer.cpp:
##########
@@ -296,8 +296,13 @@ Status ScalarColumnWriter::init() {
         RETURN_IF_ERROR(
                 BitmapIndexWriter::create(get_field()->type_info(), 
&_bitmap_index_builder));
     }
+
     if (_opts.need_bloom_filter) {
-        RETURN_IF_ERROR(BloomFilterIndexWriter::create(
+        if (_opts.is_ngram_bf_index)
+            RETURN_IF_ERROR(BloomFilterIndexWriter::create(

Review Comment:
   using NGramBloomFilterIndexWriterImpl directly may be simpler and more 
intuitive



##########
docs/zh-CN/docs/data-table/index/ngram-bloomfilter-index.md:
##########
@@ -0,0 +1,79 @@
+---
+{
+    "title": "NGram BloomFilter索引",
+    "language": "zh-CN"
+}
+---
+
+<!--
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements.  See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership.  The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing,
+software distributed under the License is distributed on an
+"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+KIND, either express or implied.  See the License for the
+specific language governing permissions and limitations
+under the License.
+-->
+
+# Doris NGram BloomFilter索引及使用使用场景
+
+为了提升like的查询性能,增加了NGram BloomFilter索引,其实现主要参照了ClickHouse的ngrambf。
+
+## NGram BloomFilter创建
+
+表创建时指定:
+
+```sql
+CREATE TABLE `table3` (
+  `siteid` int(11) NULL DEFAULT "10" COMMENT "",
+  `citycode` smallint(6) NULL COMMENT "",
+  `username` varchar(32) NULL DEFAULT "" COMMENT "",
+  INDEX idx_ngrambf (`username`) USING NGRAM_BF (3,256) COMMENT 'username 
ngram_bf index'
+) ENGINE=OLAP
+AGGREGATE KEY(`siteid`, `citycode`, `username`) COMMENT "OLAP"
+DISTRIBUTED BY HASH(`siteid`) BUCKETS 10
+PROPERTIES (
+"replication_num" = "1"
+);
+
+-- 其中(3,256),分别表示ngram的个数和bloomfilter的字节数。

Review Comment:
   It would be helpful to give users some suggestions on how to determine the 
size according to data distribution and error rate expectation.



##########
fe/fe-core/src/main/cup/sql_parser.cup:
##########
@@ -240,7 +240,7 @@ parser code {:
 
 // Total keywords of doris
 terminal String KW_ADD, KW_ADMIN, KW_AFTER, KW_AGGREGATE, KW_ALIAS, KW_ALL, 
KW_ALTER, KW_AND, KW_ANTI, KW_APPEND, KW_AS, KW_ASC, KW_AUTHORS, KW_ARRAY,
-    KW_BACKEND, KW_BACKUP, KW_BETWEEN, KW_BEGIN, KW_BIGINT, KW_BINLOG, 
KW_BITMAP, KW_BITMAP_UNION, KW_QUANTILE_STATE, KW_QUANTILE_UNION, KW_BLOB, 
KW_BOOLEAN, KW_BROKER, KW_BACKENDS, KW_BY, KW_BUILTIN,
+    KW_BACKEND, KW_BACKUP, KW_BETWEEN, KW_BEGIN, KW_BIGINT, KW_BINLOG, 
KW_BITMAP, KW_BITMAP_UNION,KW_NGRAM_BF, KW_QUANTILE_STATE, KW_QUANTILE_UNION, 
KW_BLOB, KW_BOOLEAN, KW_BROKER, KW_BACKENDS, KW_BY, KW_BUILTIN,

Review Comment:
   missed a whitespace



##########
fe/fe-core/src/main/java/org/apache/doris/analysis/IndexDef.java:
##########
@@ -142,6 +217,30 @@ public void checkColumn(Column column, KeysType keysType) 
throws AnalysisExcepti
                         "BITMAP index only used in columns of 
DUP_KEYS/UNIQUE_KEYS table or key columns of"
                                 + " AGG_KEYS table. invalid column: " + 
indexColName);
             }
+        } else if (indexType == IndexType.NGRAM_BF) {
+            String indexColName = column.getName();
+            PrimitiveType colType = column.getDataType();
+            if (colType != PrimitiveType.CHAR && colType != 
PrimitiveType.VARCHAR) {
+                throw new AnalysisException(colType + " is not supported in 
ngram_bf index. "
+                        + "invalid column: " + indexColName);
+            } else if ((keysType == KeysType.AGG_KEYS && !column.isKey())) {
+                throw new AnalysisException(
+                        "ngram_bf index only used in columns of 
DUP_KEYS/UNIQUE_KEYS table or key columns of"
+                                + " AGG_KEYS table. invalid column: " + 
indexColName);
+            }
+            if (arguments == null || arguments.size() != 2) {
+                throw new AnalysisException("ngram should have ngram size and 
bloom filter size arguments");
+            }
+            Expr ngramSize = arguments.get(0);
+            if (!(ngramSize instanceof IntLiteral && ((IntLiteral) 
ngramSize).getLongValue() < 256
+                    && ((IntLiteral) ngramSize).getLongValue() >= 1)) {
+                throw new AnalysisException("ngram size should be integer and 
less than 256");
+            }
+            Expr bfSize = arguments.get(1);
+            if (!(bfSize instanceof IntLiteral && ((IntLiteral) 
bfSize).getLongValue() < 65536

Review Comment:
   is hardcoded limit 64K too strict?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to