FrankChen021 commented on code in PR #19579: URL: https://github.com/apache/druid/pull/19579#discussion_r3413389507
########## processing/src/main/java/org/apache/druid/data/input/impl/ClusteredValueGroupsBaseTableProjectionSpec.java: ########## @@ -0,0 +1,298 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.druid.data.input.impl; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonIgnore; +import com.fasterxml.jackson.annotation.JsonInclude; +import com.fasterxml.jackson.annotation.JsonProperty; +import com.fasterxml.jackson.annotation.JsonTypeName; +import com.google.common.collect.Sets; +import org.apache.druid.error.InvalidInput; +import org.apache.druid.query.OrderBy; +import org.apache.druid.query.aggregation.AggregatorFactory; +import org.apache.druid.segment.VirtualColumns; +import org.apache.druid.segment.column.ColumnHolder; + +import javax.annotation.Nullable; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; +import java.util.Objects; +import java.util.Set; + +/** + * {@link BaseTableProjectionSpec} for the clustered-value-groups base table mode: rows are partitioned into per-tuple + * "cluster groups" keyed by one or more typed clustering columns, optionally derived from {@link #virtualColumns}. + * Essentially, each group is stored as its own internal sub-segment.. + * <p> + * The operator declares (1) the typed clustering columns (as ordinary {@link DimensionSchema}s, they're real + * dimensions, just stored differently), (2) optional virtual columns, and (3) the non-clustering + * dimensions and metrics. {@link #getDimensionsSpec()} returns the unified spec in canonical order + * (clustering dims first, then non-clustering); {@link #getOrdering()} is either the declared ordering or, by + * default, all clustering dims ascending followed by {@code __time} ascending. + * <p> + * A clustered base table is never rollup. Query granularity, when wanted, is just another entry in + * {@link #getVirtualColumns()} named + * {@link org.apache.druid.java.util.common.granularity.Granularities#GRANULARITY_VIRTUAL_COLUMN_NAME}; absent that + * virtual column the query granularity is {@code NONE}. Segment granularity and intervals live on the top-level + * {@link org.apache.druid.indexer.granularity.SegmentGranularitySpec}, not here. + */ +@JsonTypeName(ClusteredValueGroupsBaseTableProjectionSpec.TYPE_NAME) +public final class ClusteredValueGroupsBaseTableProjectionSpec implements BaseTableProjectionSpec +{ + public static final String TYPE_NAME = "clusteredValueGroups"; + + private final VirtualColumns virtualColumns; + private final List<DimensionSchema> clusteringColumns; + private final List<DimensionSchema> nonClusteringDimensions; + private final AggregatorFactory[] metrics; + private final DimensionsSpec dimensionsSpec; + private final List<OrderBy> ordering; + + public static Builder builder() + { + return new Builder(); + } + + @JsonCreator + public ClusteredValueGroupsBaseTableProjectionSpec( + @JsonProperty("virtualColumns") @Nullable VirtualColumns virtualColumns, + @JsonProperty("clusteringColumns") List<DimensionSchema> clusteringColumns, + @JsonProperty("dimensions") @Nullable List<DimensionSchema> nonClusteringDimensions, + @JsonProperty("metrics") @Nullable AggregatorFactory[] metrics, + @JsonProperty("ordering") @Nullable List<OrderBy> declaredOrdering + ) + { + if (clusteringColumns == null || clusteringColumns.isEmpty()) { + throw InvalidInput.exception("clusteringColumns must be non-empty for clusteredValueGroups base table"); + } + this.clusteringColumns = Collections.unmodifiableList(new ArrayList<>(clusteringColumns)); + this.virtualColumns = virtualColumns == null ? VirtualColumns.EMPTY : virtualColumns; + this.nonClusteringDimensions = nonClusteringDimensions == null + ? Collections.emptyList() + : Collections.unmodifiableList(new ArrayList<>(nonClusteringDimensions)); + this.metrics = metrics == null ? new AggregatorFactory[0] : metrics; + validateNoOverlap(this.clusteringColumns, this.nonClusteringDimensions); + this.dimensionsSpec = computeDimensionsSpec(this.clusteringColumns, this.nonClusteringDimensions); + this.ordering = declaredOrdering != null Review Comment: [P2] Declared ordering can be advertised without being honored The spec accepts any declaredOrdering here, but the clustered write path does not use it to build the actual per-group row comparator: OnHeapClusterGroup derives ordering from timePosition/DimensionsSpec, while IndexMergerV10 emits groups in ascending tuple order and then stores firstSchema.getOrdering() in metadata. A spec such as [tenant ASC, page ASC, __time ASC], or any DESC order, can therefore be persisted with advertised ordering that the data does not actually satisfy. Query engines may trust CursorHolder/DataSegment ordering and skip sorting, producing misordered results. Please reject unsupported ordering or wire the declared order through ingestion and merge. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected] --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
