YuweiXiao commented on a change in pull request #3173:
URL: https://github.com/apache/hudi/pull/3173#discussion_r759777178


##########
File path: hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieIndexConfig.java
##########
@@ -200,6 +209,48 @@
       .defaultValue("true")
       .withDocumentation("Similar to " + BLOOM_INDEX_UPDATE_PARTITION_PATH_ENABLE + ", but for simple index.");
 
+  /**
+   * ***** Bucket Index Configs *****
+   * Bucket Index aims to locate records quickly by hash in big data scenarios.
+   * The current implementation is a basic version, so there are some constraints:
+   * 1. Unsupported operations: bulk insert, clustering, and so on.
+   * 2. Changing the bucket num requires rewriting the partition.
+   * 3. Estimate the table size and future data growth to set a reasonable bucket num.
+   * 4. A bucket size of less than 3GB is recommended; avoid making it too small.
+   */
+  // Bucket num equals the number of file groups in each partition.
+  // Bucket num can be set according to partition size and file group size.
+  public static final ConfigProperty<Integer> BUCKET_INDEX_NUM_BUCKETS = ConfigProperty
+      .key("hoodie.bucket.index.num.buckets")
+      .defaultValue(256)
+      .withDocumentation("Only applies if index type is BUCKET_INDEX. Determines the bucket num of the hudi table, "
+          + "and each partition is divided into N buckets.");
+
+  public static final ConfigProperty<String> BUCKET_INDEX_HASH_FIELD = ConfigProperty
+      .key("hoodie.bucket.index.hash.field")
+      .noDefaultValue()
+      .withDocumentation("Index key. It is used to index the record and find its file group. "
+          + "If not set, the record key field is used as default.");
+
+  public static final ConfigProperty<String> BUCKET_INDEX_HASH_FUNCTION = ConfigProperty
+      .key("hoodie.bucket.index.hash.function")
+      .defaultValue("JVMHash")
+      .withDocumentation("Hash function used to compute the index key hash value. "
+          + "Possible options are [JVMHash | HiveHash].");
+
+  public static final Set<WriteOperationType> BUCKET_INDEX_SUPPORTED_OPERATIONS = new HashSet<WriteOperationType>() {{
+    add(WriteOperationType.INSERT);

Review comment:
       The current implementation does not seem to support insert: two consecutive insert operations (with records going into the same bucket) will produce two different parquet files with the same bucketId.
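
       For illustration, here is a minimal, hypothetical sketch (not the code in this PR; the class, field, and method names are assumptions) of how a hash-based bucket index is expected to route records: bucketId = hash(hash field) % numBuckets, with exactly one file group per bucket per partition. It shows why two insert operations that hash to the same bucket should resolve to the same existing file group rather than each creating a new base file.

       ```java
       import java.util.HashMap;
       import java.util.Map;

       // Illustrative sketch only, not Hudi's implementation. It mirrors the config semantics above:
       // hoodie.bucket.index.num.buckets fixes the bucket count, and each bucket maps to one file group.
       public class BucketIndexSketch {

         private final int numBuckets; // analogous to hoodie.bucket.index.num.buckets
         // partition -> (bucketId -> fileGroupId); one file group per bucket per partition
         private final Map<String, Map<Integer, String>> partitionToBuckets = new HashMap<>();

         public BucketIndexSketch(int numBuckets) {
           this.numBuckets = numBuckets;
         }

         // "JVMHash"-style hashing: plain hashCode of the hash field value, mapped into [0, numBuckets)
         private int bucketIdFor(String hashFieldValue) {
           return Math.floorMod(hashFieldValue.hashCode(), numBuckets);
         }

         // Two inserts with the same hash field value must resolve to the same file group id;
         // otherwise the partition ends up with two parquet files carrying the same bucket id.
         public String locateFileGroup(String partition, String hashFieldValue) {
           int bucketId = bucketIdFor(hashFieldValue);
           return partitionToBuckets
               .computeIfAbsent(partition, p -> new HashMap<>())
               .computeIfAbsent(bucketId, id -> String.format("%08d-filegroup", id));
         }

         public static void main(String[] args) {
           BucketIndexSketch index = new BucketIndexSketch(256);
           // Consecutive inserts of the same key: both prints should show the same file group.
           System.out.println(index.locateFileGroup("2021/11/30", "uuid-123"));
           System.out.println(index.locateFileGroup("2021/11/30", "uuid-123"));
         }
       }
       ```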
########## File path: hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieIndexConfig.java ########## @@ -200,6 +209,48 @@ .defaultValue("true") .withDocumentation("Similar to " + BLOOM_INDEX_UPDATE_PARTITION_PATH_ENABLE + ", but for simple index."); + /** + * ***** Bucket Index Configs ***** + * Bucket Index is targeted to locate the record fast by hash in big data scenarios. + * The current implementation is a basic version, so there are some constraints: + * 1. Unsupported operation: bulk insert, cluster and so on. + * 2. Bucket num change requires rewriting the partition. + * 3. Predict the table size and future data growth well to set a reasonable bucket num. + * 4. A bucket size is recommended less than 3GB and avoid bing too small. + */ + // Bucket num equals file groups num in each partition. + // Bucket num can be set according to partition size and file group size. + public static final ConfigProperty<Integer> BUCKET_INDEX_NUM_BUCKETS = ConfigProperty + .key("hoodie.bucket.index.num.buckets") + .defaultValue(256) + .withDocumentation("Only applies if index type is BUCKET_INDEX. Determine the bucket num of the hudi table, " + + "and each partition is divided to N buckets."); + + public static final ConfigProperty<String> BUCKET_INDEX_HASH_FIELD = ConfigProperty + .key("hoodie.bucket.index.hash.field") + .noDefaultValue() + .withDocumentation("Index key. It is used to index the record and find its file group. " + + "If not set, use record key field as default"); + + public static final ConfigProperty<String> BUCKET_INDEX_HASH_FUNCTION = ConfigProperty + .key("hoodie.bucket.index.hash.function") + .defaultValue("JVMHash") + .withDocumentation("Hash function. It is used to compute the index key hash value " + + "Possible options are [JVMHash | HiveHash]. "); + + public static final Set<WriteOperationType> BUCKET_INDEX_SUPPORTED_OPERATIONS = new HashSet<WriteOperationType>() {{ + add(WriteOperationType.INSERT); Review comment: The current implementation seems not support insert. Two consecutive insertions (with records going into same bucket) will produce two different parquet files with the same bucketId. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org