capistrant commented on code in PR #19061:
URL: https://github.com/apache/druid/pull/19061#discussion_r2956359385
##########
embedded-tests/src/test/java/org/apache/druid/testing/embedded/compact/CompactionSupervisorTest.java:
##########
@@ -575,6 +527,218 @@ public void
test_cascadingReindexing_withVirtualColumnOnNestedData_filtersCorrec
Assertions.assertTrue(count > 0);
}
+ @Test
+ public void test_compaction_cluster_by_virtualcolumn()
+ {
+ // Virtual Columns on nested data is only supported with MSQ compaction
engine right now.
+ CompactionEngine compactionEngine = CompactionEngine.MSQ;
+ configureCompaction(compactionEngine, null);
+
+ String jsonDataWithNestedColumn =
+ """
+ {"timestamp": "2023-01-01T00:00:00", "str":"a", "obj":{"a":
"ll"}}
+ {"timestamp": "2023-01-01T00:00:00", "str":"", "obj":{"a":
"mm"}}
+ {"timestamp": "2023-01-01T00:00:00", "str":"null", "obj":{"a":
"nn"}}
+ {"timestamp": "2023-01-01T00:00:00", "str":"b", "obj":{"a":
"oo"}}
+ {"timestamp": "2023-01-01T00:00:00", "str":"c", "obj":{"a":
"pp"}}
+ {"timestamp": "2023-01-01T00:00:00", "str":"d", "obj":{"a":
"qq"}}
+ {"timestamp": "2023-01-01T00:00:00", "str":null, "obj":{"a":
"rr"}}
+ """;
+
+ final TaskBuilder.Index task = TaskBuilder
+ .ofTypeIndex()
+ .dataSource(dataSource)
+ .jsonInputFormat()
+ .inlineInputSourceWithData(jsonDataWithNestedColumn)
+ .isoTimestampColumn("timestamp")
+ .schemaDiscovery()
+ .granularitySpec("DAY", null, false);
+
+ cluster.callApi().runTask(task.withId(IdUtils.getRandomId()), overlord);
+ cluster.callApi().waitForAllSegmentsToBeAvailable(dataSource, coordinator,
broker);
+
+ Assertions.assertEquals(7, getTotalRowCount());
+
+ VirtualColumns virtualColumns = VirtualColumns.create(
+ new ExpressionVirtualColumn("v0", "json_value(obj, '$.a')",
ColumnType.STRING, TestExprMacroTable.INSTANCE)
+ );
+
+ InlineSchemaDataSourceCompactionConfig config =
+ InlineSchemaDataSourceCompactionConfig
+ .builder()
+ .forDataSource(dataSource)
+ .withSkipOffsetFromLatest(Period.seconds(0))
+ .withTransformSpec(
+ new CompactionTransformSpec(
+ null,
+ virtualColumns
+ )
+ )
+ .withTuningConfig(
+ new UserCompactionTaskQueryTuningConfig(
Review Comment:
A recent PR added a builder for `UserCompactionTaskQueryTuningConfig`; consider using it here instead of the positional constructor.
##########
processing/src/test/java/org/apache/druid/query/filter/FilterSegmentPrunerTest.java:
##########
@@ -70,15 +72,71 @@ void testPrune()
List<DataSegment> segs = List.of(seg1, seg2, seg3, seg4, seg5, seg6, seg7);
- FilterSegmentPruner prunerRange = new FilterSegmentPruner(range_a, null);
- FilterSegmentPruner prunerEmptyFields = new FilterSegmentPruner(range_a,
Collections.emptySet());
- FilterSegmentPruner prunerExpression = new
FilterSegmentPruner(expression_b, null);
+ FilterSegmentPruner prunerRange = new FilterSegmentPruner(range_a, null,
null);
+ FilterSegmentPruner prunerEmptyFields = new FilterSegmentPruner(range_a,
Collections.emptySet(), null);
+ FilterSegmentPruner prunerExpression = new
FilterSegmentPruner(expression_b, null, null);
Assertions.assertEquals(Set.of(seg1, seg4, seg5, seg6, seg7),
prunerRange.prune(segs, Function.identity()));
Assertions.assertEquals(Set.copyOf(segs), prunerExpression.prune(segs,
Function.identity()));
Assertions.assertEquals(Set.copyOf(segs), prunerEmptyFields.prune(segs,
Function.identity()));
}
+ @Test
+ void testPruneVirtualColumn()
+ {
+ VirtualColumns shardVirtualColumns = VirtualColumns.create(
+ new ExpressionVirtualColumn("vdim1", "concat(dim1, 'foo')",
ColumnType.STRING, TestExprMacroTable.INSTANCE)
+ );
+ String interval1 = "2026-02-18T00:00:00Z/2026-02-19T00:00:00Z";
+
+ DataSegment seg1 = makeDataSegment(
+ interval1,
+ makeRange(List.of("vdim1"), shardVirtualColumns, 0, null,
StringTuple.create("abcfoo"))
+ );
+ DataSegment seg2 = makeDataSegment(
+ interval1,
+ makeRange(List.of("vdim1"), shardVirtualColumns, 1,
StringTuple.create("abcfoo"), StringTuple.create("lmnfoo"))
+ );
+ DataSegment seg3 = makeDataSegment(
+ interval1,
+ makeRange(List.of("vdim1"), shardVirtualColumns, 2,
StringTuple.create("lmnfoo"), null)
+ );
+
+ List<DataSegment> segs = List.of(seg1, seg2, seg3);
+
+ // same expression, same name
+ VirtualColumns queryVirtualColumns = VirtualColumns.create(
+ new ExpressionVirtualColumn("vdim1", "concat(dim1, 'foo')",
ColumnType.STRING, TestExprMacroTable.INSTANCE)
+ );
+ DimFilter range_a = new RangeFilter("vdim1", ColumnType.STRING, null,
"aaa", null, null, null);
+ FilterSegmentPruner prunerRange = new FilterSegmentPruner(range_a, null,
queryVirtualColumns);
+ FilterSegmentPruner prunerEmptyFields = new FilterSegmentPruner(range_a,
Collections.emptySet(), queryVirtualColumns);
+ Assertions.assertEquals(Set.of(seg1), prunerRange.prune(segs,
Function.identity()));
+ Assertions.assertEquals(Set.copyOf(segs), prunerEmptyFields.prune(segs,
Function.identity()));
+
+ // same expression, different name
+ queryVirtualColumns = VirtualColumns.create(
+ new ExpressionVirtualColumn("v0", "concat(dim1, 'foo')",
ColumnType.STRING, TestExprMacroTable.INSTANCE)
+ );
+ range_a = new RangeFilter("v0", ColumnType.STRING, null, "aaa", null,
null, null);
+ prunerRange = new FilterSegmentPruner(range_a, null, queryVirtualColumns);
+ prunerEmptyFields = new FilterSegmentPruner(range_a,
Collections.emptySet(), queryVirtualColumns);
+
+ Assertions.assertEquals(Set.of(seg1), prunerRange.prune(segs,
Function.identity()));
+ Assertions.assertEquals(Set.copyOf(segs), prunerEmptyFields.prune(segs,
Function.identity()));
+
+ // same expression, different name
+ queryVirtualColumns = VirtualColumns.create(
+ new ExpressionVirtualColumn("v10", "concat(dim1, 'foo')",
ColumnType.STRING, TestExprMacroTable.INSTANCE)
+ );
+ range_a = new RangeFilter("v10", ColumnType.STRING, null, "aaa", null,
null, null);
+ prunerRange = new FilterSegmentPruner(range_a, null, queryVirtualColumns);
+ prunerEmptyFields = new FilterSegmentPruner(range_a,
Collections.emptySet(), queryVirtualColumns);
+
+ Assertions.assertEquals(Set.of(seg1), prunerRange.prune(segs,
Function.identity()));
+ Assertions.assertEquals(Set.copyOf(segs), prunerEmptyFields.prune(segs,
Function.identity()));
Review Comment:
Are these testing different things? They look the same to me: just different
virtual column names, with neither matching the shard virtual column.
##########
embedded-tests/src/test/java/org/apache/druid/testing/embedded/compact/CompactionSupervisorTest.java:
##########
@@ -575,6 +527,218 @@ public void
test_cascadingReindexing_withVirtualColumnOnNestedData_filtersCorrec
Assertions.assertTrue(count > 0);
}
+ @Test
+ public void test_compaction_cluster_by_virtualcolumn()
+ {
+ // Virtual Columns on nested data is only supported with MSQ compaction
engine right now.
+ CompactionEngine compactionEngine = CompactionEngine.MSQ;
+ configureCompaction(compactionEngine, null);
+
+ String jsonDataWithNestedColumn =
+ """
+ {"timestamp": "2023-01-01T00:00:00", "str":"a", "obj":{"a":
"ll"}}
+ {"timestamp": "2023-01-01T00:00:00", "str":"", "obj":{"a":
"mm"}}
+ {"timestamp": "2023-01-01T00:00:00", "str":"null", "obj":{"a":
"nn"}}
+ {"timestamp": "2023-01-01T00:00:00", "str":"b", "obj":{"a":
"oo"}}
+ {"timestamp": "2023-01-01T00:00:00", "str":"c", "obj":{"a":
"pp"}}
+ {"timestamp": "2023-01-01T00:00:00", "str":"d", "obj":{"a":
"qq"}}
+ {"timestamp": "2023-01-01T00:00:00", "str":null, "obj":{"a":
"rr"}}
+ """;
+
+ final TaskBuilder.Index task = TaskBuilder
+ .ofTypeIndex()
+ .dataSource(dataSource)
+ .jsonInputFormat()
+ .inlineInputSourceWithData(jsonDataWithNestedColumn)
+ .isoTimestampColumn("timestamp")
+ .schemaDiscovery()
+ .granularitySpec("DAY", null, false);
+
+ cluster.callApi().runTask(task.withId(IdUtils.getRandomId()), overlord);
+ cluster.callApi().waitForAllSegmentsToBeAvailable(dataSource, coordinator,
broker);
+
+ Assertions.assertEquals(7, getTotalRowCount());
+
+ VirtualColumns virtualColumns = VirtualColumns.create(
+ new ExpressionVirtualColumn("v0", "json_value(obj, '$.a')",
ColumnType.STRING, TestExprMacroTable.INSTANCE)
+ );
+
+ InlineSchemaDataSourceCompactionConfig config =
+ InlineSchemaDataSourceCompactionConfig
+ .builder()
+ .forDataSource(dataSource)
+ .withSkipOffsetFromLatest(Period.seconds(0))
+ .withTransformSpec(
+ new CompactionTransformSpec(
+ null,
+ virtualColumns
+ )
+ )
+ .withTuningConfig(
+ new UserCompactionTaskQueryTuningConfig(
+ null,
+ null,
+ null,
+ null,
+ null,
+ new DimensionRangePartitionsSpec(4, null, List.of("v0"),
false),
+ null,
+ null,
+ null,
+ null,
+ null,
+ null,
+ null,
+ null,
+ null,
+ null,
+ null,
+ null,
+ null
+ )
+ )
+ .build();
+
+ runCompactionWithSpec(config);
+ waitForAllCompactionTasksToFinish();
+
+ cluster.callApi().waitForAllSegmentsToBeAvailable(dataSource, coordinator,
broker);
+
+ List<DataSegment> segments =
cluster.callApi().getVisibleUsedSegments(dataSource,
overlord).stream().toList();
+ Assertions.assertEquals(2, segments.size());
+ Assertions.assertEquals(
+ new DimensionRangeShardSpec(
+ List.of("v0"),
+ virtualColumns,
+ null,
+ StringTuple.create("oo"),
+ 0,
+ 2
+ ),
+ segments.get(0).getShardSpec()
+ );
+ Assertions.assertEquals(
+ new DimensionRangeShardSpec(
+ List.of("v0"),
+ virtualColumns,
+ StringTuple.create("oo"),
+ null,
+ 1,
+ 2
+ ),
+ segments.get(1).getShardSpec()
+ );
+ }
+
+ @Test
+ public void test_compaction_cluster_by_virtualcolumn_rollup()
+ {
+ // Virtual Columns on nested data is only supported with MSQ compaction
engine right now.
+ CompactionEngine compactionEngine = CompactionEngine.MSQ;
+ configureCompaction(compactionEngine, null);
+
+ String jsonDataWithNestedColumn =
+ """
+ {"timestamp": "2023-01-01T00:00:00", "str":"a", "obj":{"a":
"ll"}}
+ {"timestamp": "2023-01-01T00:00:00", "str":"", "obj":{"a":
"mm"}}
+ {"timestamp": "2023-01-01T00:00:00", "str":"null", "obj":{"a":
"nn"}}
+ {"timestamp": "2023-01-01T00:00:00", "str":"b", "obj":{"a":
"oo"}}
+ {"timestamp": "2023-01-01T00:00:00", "str":"c", "obj":{"a":
"pp"}}
+ {"timestamp": "2023-01-01T00:00:00", "str":"d", "obj":{"a":
"qq"}}
+ {"timestamp": "2023-01-01T00:00:00", "str":null, "obj":{"a":
"rr"}}
+ """;
+
+ final TaskBuilder.Index task = TaskBuilder
+ .ofTypeIndex()
+ .dataSource(dataSource)
+ .jsonInputFormat()
+ .inlineInputSourceWithData(jsonDataWithNestedColumn)
+ .isoTimestampColumn("timestamp")
+ .schemaDiscovery()
+ .dataSchema(builder -> builder.withAggregators(new
CountAggregatorFactory("count")))
+ .granularitySpec("DAY", "MINUTE", true);
+
+ cluster.callApi().runTask(task.withId(IdUtils.getRandomId()), overlord);
+ cluster.callApi().waitForAllSegmentsToBeAvailable(dataSource, coordinator,
broker);
+
+ Assertions.assertEquals(7, getTotalRowCount());
+
+ VirtualColumns virtualColumns = VirtualColumns.create(
+ new ExpressionVirtualColumn(
+ "v0",
+ "json_value(obj, '$.a')",
+ ColumnType.STRING,
+ TestExprMacroTable.INSTANCE
+ )
+ );
+
+ InlineSchemaDataSourceCompactionConfig config =
+ InlineSchemaDataSourceCompactionConfig
+ .builder()
+ .forDataSource(dataSource)
+ .withSkipOffsetFromLatest(Period.seconds(0))
+ .withTransformSpec(
+ new CompactionTransformSpec(
+ null,
+ virtualColumns
+ )
+ )
+ .withTuningConfig(
+ new UserCompactionTaskQueryTuningConfig(
Review Comment:
Same builder nit: consider using the `UserCompactionTaskQueryTuningConfig` builder here as well.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]