Jackie-Jiang commented on PR #12354:
URL: https://github.com/apache/pinot/pull/12354#issuecomment-1961924193
I tried the following micro-benchmark and see very interesting result:
Code:
```
package org.apache.pinot.perf;
import java.io.IOException;
import java.math.BigDecimal;
import java.util.List;
import java.util.Map;
import java.util.concurrent.TimeUnit;
import java.util.function.LongSupplier;
import org.apache.pinot.common.request.context.ExpressionContext;
import org.apache.pinot.core.common.BlockValSet;
import org.apache.pinot.core.plan.DocIdSetPlanNode;
import org.apache.pinot.core.query.aggregation.AggregationResultHolder;
import org.apache.pinot.core.query.aggregation.DoubleAggregationResultHolder;
import org.apache.pinot.core.query.aggregation.function.AggregationFunction;
import
org.apache.pinot.core.query.aggregation.function.SumAggregationFunction;
import
org.apache.pinot.core.query.aggregation.function.SumAggregationFunctionFoldDouble;
import
org.apache.pinot.core.query.aggregation.function.SumAggregationFunctionFoldHolder;
import
org.apache.pinot.core.query.aggregation.function.SumAggregationFunctionFoldPrimitive;
import org.apache.pinot.segment.spi.index.reader.Dictionary;
import org.apache.pinot.spi.data.FieldSpec.DataType;
import org.jetbrains.annotations.Nullable;
import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.BenchmarkMode;
import org.openjdk.jmh.annotations.Fork;
import org.openjdk.jmh.annotations.Level;
import org.openjdk.jmh.annotations.Measurement;
import org.openjdk.jmh.annotations.Mode;
import org.openjdk.jmh.annotations.OutputTimeUnit;
import org.openjdk.jmh.annotations.Param;
import org.openjdk.jmh.annotations.Scope;
import org.openjdk.jmh.annotations.Setup;
import org.openjdk.jmh.annotations.State;
import org.openjdk.jmh.annotations.Warmup;
import org.openjdk.jmh.infra.Blackhole;
import org.openjdk.jmh.runner.Runner;
import org.openjdk.jmh.runner.RunnerException;
import org.openjdk.jmh.runner.options.Options;
import org.openjdk.jmh.runner.options.OptionsBuilder;
import org.roaringbitmap.RoaringBitmap;
@Fork(1)
@BenchmarkMode(Mode.Throughput)
@Warmup(iterations = 2, time = 1)
@Measurement(iterations = 3, time = 1)
@OutputTimeUnit(TimeUnit.MILLISECONDS)
@State(Scope.Benchmark)
public class BenchmarkSumAggregation2 {
public static void main(String[] args)
throws RunnerException {
Options opt = new
OptionsBuilder().include(BenchmarkSumAggregation2.class.getSimpleName())
// .addProfiler(GCProfiler.class)
.build();
new Runner(opt).run();
}
private static final ExpressionContext EXPR =
ExpressionContext.forIdentifier("col");
@Param({"true", "false"})
public boolean _nullHandlingEnabled;
@Param({"2", "4", "8", "16", "32", "64", "128"})
public int _nullInterval;
@Param({"normal", "foldDouble", "foldPrimitive"})
public String _impl;
private AggregationFunction _aggregationFunction;
private AggregationResultHolder _resultHolder;
private RoaringBitmap _nullBitmap;
private long[] _values;
private Map<ExpressionContext, BlockValSet> _blockValSetMap;
private double _expectedSum;
private double _nextExpectedSum;
@Setup(Level.Trial)
public void setup()
throws IOException {
switch (_impl) {
case "normal":
_aggregationFunction = new SumAggregationFunction(List.of(EXPR),
_nullHandlingEnabled);
break;
case "foldDouble":
_aggregationFunction = new
SumAggregationFunctionFoldDouble(List.of(EXPR), _nullHandlingEnabled);
break;
case "foldPrimitive":
_aggregationFunction = new
SumAggregationFunctionFoldPrimitive(List.of(EXPR), _nullHandlingEnabled);
break;
case "foldHolder":
_aggregationFunction = new
SumAggregationFunctionFoldHolder(List.of(EXPR), _nullHandlingEnabled);
break;
default:
throw new IllegalArgumentException("Unknown impl: " + _impl);
}
_resultHolder = _aggregationFunction.createAggregationResultHolder();
_nullBitmap = createNullBitmap();
_values = new long[DocIdSetPlanNode.MAX_DOC_PER_CALL];
LongSupplier longSupplier = Distribution.createLongSupplier(42,
"EXP(0.5)");
for (int i = 0; i < DocIdSetPlanNode.MAX_DOC_PER_CALL; i++) {
_values[i] = longSupplier.getAsLong();
}
_expectedSum = 0;
for (int i = 0; i < DocIdSetPlanNode.MAX_DOC_PER_CALL; i++) {
if (_nullBitmap == null || !_nullBitmap.contains(i)) {
_expectedSum += _values[i];
}
}
_nextExpectedSum = _expectedSum;
_blockValSetMap = Map.of(EXPR, new BenchmarkBlockValSet(_nullBitmap,
_values));
}
private RoaringBitmap createNullBitmap() {
if (_nullHandlingEnabled) {
RoaringBitmap nullBitmap = new RoaringBitmap();
for (int i = 0; i < DocIdSetPlanNode.MAX_DOC_PER_CALL; i +=
_nullInterval) {
nullBitmap.add(i);
}
return nullBitmap;
} else {
return null;
}
}
private static class BenchmarkBlockValSet implements BlockValSet {
final RoaringBitmap _nullBitmap;
final long[] _values;
private BenchmarkBlockValSet(@Nullable RoaringBitmap nullBitmap, long[]
values) {
_nullBitmap = nullBitmap;
_values = values;
}
@Nullable
@Override
public RoaringBitmap getNullBitmap() {
return _nullBitmap;
}
@Override
public DataType getValueType() {
return DataType.LONG;
}
@Override
public boolean isSingleValue() {
return true;
}
@Nullable
@Override
public Dictionary getDictionary() {
throw new UnsupportedOperationException();
}
@Override
public int[] getDictionaryIdsSV() {
throw new UnsupportedOperationException();
}
@Override
public int[] getIntValuesSV() {
throw new UnsupportedOperationException();
}
@Override
public long[] getLongValuesSV() {
return _values;
}
@Override
public float[] getFloatValuesSV() {
throw new UnsupportedOperationException();
}
@Override
public double[] getDoubleValuesSV() {
throw new UnsupportedOperationException();
}
@Override
public BigDecimal[] getBigDecimalValuesSV() {
throw new UnsupportedOperationException();
}
@Override
public String[] getStringValuesSV() {
throw new UnsupportedOperationException();
}
@Override
public byte[][] getBytesValuesSV() {
throw new UnsupportedOperationException();
}
@Override
public int[][] getDictionaryIdsMV() {
throw new UnsupportedOperationException();
}
@Override
public int[][] getIntValuesMV() {
throw new UnsupportedOperationException();
}
@Override
public long[][] getLongValuesMV() {
throw new UnsupportedOperationException();
}
@Override
public float[][] getFloatValuesMV() {
throw new UnsupportedOperationException();
}
@Override
public double[][] getDoubleValuesMV() {
throw new UnsupportedOperationException();
}
@Override
public String[][] getStringValuesMV() {
throw new UnsupportedOperationException();
}
@Override
public byte[][][] getBytesValuesMV() {
throw new UnsupportedOperationException();
}
@Override
public int[] getNumMVEntries() {
throw new UnsupportedOperationException();
}
}
@Benchmark
public void test(Blackhole bh) {
_aggregationFunction.aggregate(DocIdSetPlanNode.MAX_DOC_PER_CALL,
_resultHolder, _blockValSetMap);
if (_resultHolder instanceof DoubleAggregationResultHolder) {
double result = _resultHolder.getDoubleResult();
if (result != _nextExpectedSum) {
throw new IllegalStateException("Expected: " + _nextExpectedSum + ",
got: " + result);
}
bh.consume(result);
} else {
Double result = _resultHolder.getResult();
if (result != _nextExpectedSum) {
throw new IllegalStateException("Expected: " + _nextExpectedSum + ",
got: " + _resultHolder.getResult());
}
bh.consume(result);
}
_nextExpectedSum += _expectedSum;
}
}
```
Result (`foldHolder` doesn't give correct result, please take a look):
```
Benchmark (_impl) (_nullHandlingEnabled)
(_nullInterval) Mode Cnt Score Error Units
BenchmarkSumAggregation2.test normal true
2 thrpt 3 39.276 ± 20.887 ops/ms
BenchmarkSumAggregation2.test normal true
4 thrpt 3 2.328 ± 3.333 ops/ms
BenchmarkSumAggregation2.test normal true
8 thrpt 3 3.280 ± 1.050 ops/ms
BenchmarkSumAggregation2.test normal true
16 thrpt 3 4.128 ± 1.031 ops/ms
BenchmarkSumAggregation2.test normal true
32 thrpt 3 4.516 ± 0.757 ops/ms
BenchmarkSumAggregation2.test normal true
64 thrpt 3 5.817 ± 0.127 ops/ms
BenchmarkSumAggregation2.test normal true
128 thrpt 3 5.833 ± 0.155 ops/ms
BenchmarkSumAggregation2.test normal false
2 thrpt 3 91.698 ± 16.053 ops/ms
BenchmarkSumAggregation2.test normal false
4 thrpt 3 90.171 ± 12.357 ops/ms
BenchmarkSumAggregation2.test normal false
8 thrpt 3 93.684 ± 4.769 ops/ms
BenchmarkSumAggregation2.test normal false
16 thrpt 3 91.339 ± 49.780 ops/ms
BenchmarkSumAggregation2.test normal false
32 thrpt 3 92.072 ± 13.054 ops/ms
BenchmarkSumAggregation2.test normal false
64 thrpt 3 92.704 ± 4.640 ops/ms
BenchmarkSumAggregation2.test normal false
128 thrpt 3 91.891 ± 20.353 ops/ms
BenchmarkSumAggregation2.test foldDouble true
2 thrpt 3 28.216 ± 0.871 ops/ms
BenchmarkSumAggregation2.test foldDouble true
4 thrpt 3 49.684 ± 12.845 ops/ms
BenchmarkSumAggregation2.test foldDouble true
8 thrpt 3 88.009 ± 24.841 ops/ms
BenchmarkSumAggregation2.test foldDouble true
16 thrpt 3 123.312 ± 11.984 ops/ms
BenchmarkSumAggregation2.test foldDouble true
32 thrpt 3 116.148 ± 65.820 ops/ms
BenchmarkSumAggregation2.test foldDouble true
64 thrpt 3 123.562 ± 23.135 ops/ms
BenchmarkSumAggregation2.test foldDouble true
128 thrpt 3 115.864 ± 116.102 ops/ms
BenchmarkSumAggregation2.test foldDouble false
2 thrpt 3 89.577 ± 69.898 ops/ms
BenchmarkSumAggregation2.test foldDouble false
4 thrpt 3 91.662 ± 35.556 ops/ms
BenchmarkSumAggregation2.test foldDouble false
8 thrpt 3 91.258 ± 17.352 ops/ms
BenchmarkSumAggregation2.test foldDouble false
16 thrpt 3 90.990 ± 27.492 ops/ms
BenchmarkSumAggregation2.test foldDouble false
32 thrpt 3 91.151 ± 22.358 ops/ms
BenchmarkSumAggregation2.test foldDouble false
64 thrpt 3 92.060 ± 3.458 ops/ms
BenchmarkSumAggregation2.test foldDouble false
128 thrpt 3 91.715 ± 4.820 ops/ms
BenchmarkSumAggregation2.test foldPrimitive true
2 thrpt 3 26.680 ± 7.063 ops/ms
BenchmarkSumAggregation2.test foldPrimitive true
4 thrpt 3 46.278 ± 5.670 ops/ms
BenchmarkSumAggregation2.test foldPrimitive true
8 thrpt 3 80.658 ± 33.080 ops/ms
BenchmarkSumAggregation2.test foldPrimitive true
16 thrpt 3 149.492 ± 30.142 ops/ms
BenchmarkSumAggregation2.test foldPrimitive true
32 thrpt 3 225.837 ± 63.126 ops/ms
BenchmarkSumAggregation2.test foldPrimitive true
64 thrpt 3 186.245 ± 24.884 ops/ms
BenchmarkSumAggregation2.test foldPrimitive true
128 thrpt 3 198.177 ± 73.624 ops/ms
BenchmarkSumAggregation2.test foldPrimitive false
2 thrpt 3 364.593 ± 44.298 ops/ms
BenchmarkSumAggregation2.test foldPrimitive false
4 thrpt 3 359.073 ± 194.996 ops/ms
BenchmarkSumAggregation2.test foldPrimitive false
8 thrpt 3 362.277 ± 92.352 ops/ms
BenchmarkSumAggregation2.test foldPrimitive false
16 thrpt 3 362.172 ± 52.064 ops/ms
BenchmarkSumAggregation2.test foldPrimitive false
32 thrpt 3 350.423 ± 193.227 ops/ms
BenchmarkSumAggregation2.test foldPrimitive false
64 thrpt 3 350.067 ± 107.294 ops/ms
BenchmarkSumAggregation2.test foldPrimitive false
128 thrpt 3 361.977 ± 128.563 ops/ms
```
As you can see, new implementation is giving much better performance when
null is enabled, and similar performance when null is disabled. `foldPrimitive`
gives even better performance, even though we cannot use it because of the
potential overflow.
I tried this on my laptop. Can you try this benchmark on your work station
and see if you can produce the results? Maybe we can get more insights from it.
Some improvements we should do for the new implementation:
- When null is enabled, we should use `ObjectAggregationResultHolder` to
handle the case of all values are `null`
- Fix the `foldHolder` implementation to give correct result
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]