On Tue, 27 Aug 2024 09:47:20 GMT, Per Minborg <pminb...@openjdk.org> wrote:
>> As discussed offline, can't we use a stable array of functions or something >> like that which can be populated lazily? That way you can access the >> function you want in a single array access, and we could put all these >> helper methods somewhere else. > > Unfortunately, a stable array of functions/MethodHandles didn't work from a > performance perspective. > Here is a benchmark that fills segments of various random sizes: Without proper branch-miss perf counters it is difficult to say whether it is actually messing with the Apple MX branch predictor. On my Ryzen, this is the test that messes with the branch prediction (which is fairly good on AMD); clearly, not inlining `fill` is a trick to get `MemorySegment::fill` inlined while still keeping the branch predictor targets "stable" for our purposes: import org.openjdk.jmh.annotations.Benchmark; import org.openjdk.jmh.annotations.BenchmarkMode; import org.openjdk.jmh.annotations.CompilerControl; import org.openjdk.jmh.annotations.Fork; import org.openjdk.jmh.annotations.Measurement; import org.openjdk.jmh.annotations.Mode; import org.openjdk.jmh.annotations.OutputTimeUnit; import org.openjdk.jmh.annotations.Param; import org.openjdk.jmh.annotations.Scope; import org.openjdk.jmh.annotations.Setup; import org.openjdk.jmh.annotations.State; import org.openjdk.jmh.annotations.Warmup; import java.lang.foreign.MemorySegment; import java.util.Random; import java.util.concurrent.TimeUnit; @BenchmarkMode(Mode.AverageTime) @Warmup(iterations = 5, time = 500, timeUnit = TimeUnit.MILLISECONDS) @Measurement(iterations = 10, time = 500, timeUnit = TimeUnit.MILLISECONDS) @State(Scope.Thread) @OutputTimeUnit(TimeUnit.NANOSECONDS) @Fork(value = 3) public class TestFill { @Param({"false", "true"}) private boolean shuffle; private MemorySegment[] segments; @Param({ "1024", "128000"}) private int samples; private byte[] segmentSequence; @Setup public void setup() { segments = new MemorySegment[8]; // still allocates 8 different arrays for 
(int i = 0; i < 8; i++) { // we always pay the most of the cost here, for fun byte[] a = shuffle? new byte[i + 1] : new byte[8]; segments[i] = MemorySegment.ofArray(a); } segmentSequence = new byte[samples]; var rnd = new Random(42); for(int i = 0; i < samples; i++) { // if shuffle == false always fall into the "worst" case of populating 8 bytes segmentSequence[i] = (byte) rnd.nextInt(0, 8); } } @Benchmark public void heap_segment_fill() { var segments = this.segments; for (int nextIndex : segmentSequence) { fill(segments[nextIndex]); } } @CompilerControl(CompilerControl.Mode.DONT_INLINE) public void fill(MemorySegment segment) { segment.fill((byte) 0); } } With # JMH version: 1.34 # VM version: JDK 21, Java HotSpot(TM) 64-Bit Server VM, 21+35-LTS-2513 I got the results below. This means that, even though it is not that optimized on JDK 21, this benchmark still messes with the branch predictor enough to hurt performance badly, as the perf counters show: Benchmark (samples) (shuffle) Mode Cnt Score Error Units TestFill.heap_segment_fill 1024 false avgt 30 10296.595 ± 19.694 ns/op TestFill.heap_segment_fill:CPI 1024 false avgt 3 0.200 ± 0.006 clks/insn TestFill.heap_segment_fill:IPC 1024 false avgt 3 5.006 ± 0.152 insns/clk TestFill.heap_segment_fill:L1-dcache-load-misses 1024 false avgt 3 7.839 ± 35.541 #/op TestFill.heap_segment_fill:L1-dcache-loads 1024 false avgt 3 90908.364 ± 19714.476 #/op TestFill.heap_segment_fill:L1-icache-load-misses 1024 false avgt 3 0.458 ± 1.347 #/op TestFill.heap_segment_fill:L1-icache-loads 1024 false avgt 3 70.000 ± 287.459 #/op TestFill.heap_segment_fill:branch-misses 1024 false avgt 3 8.666 ± 10.013 #/op TestFill.heap_segment_fill:branches 1024 false avgt 3 49674.054 ± 9931.580 #/op TestFill.heap_segment_fill:cycles 1024 false avgt 3 46501.496 ± 8694.782 #/op TestFill.heap_segment_fill:dTLB-load-misses 1024 false avgt 3 0.186 ± 0.549 #/op TestFill.heap_segment_fill:dTLB-loads 1024 false avgt 3 1.426 ± 4.003 #/op TestFill.heap_segment_fill:iTLB-load-misses 1024 
false avgt 3 0.126 ± 0.405 #/op TestFill.heap_segment_fill:iTLB-loads 1024 false avgt 3 0.249 ± 0.869 #/op TestFill.heap_segment_fill:instructions 1024 false avgt 3 232778.290 ± 47179.208 #/op TestFill.heap_segment_fill:stalled-cycles-frontend 1024 false avgt 3 257.566 ± 778.186 #/op TestFill.heap_segment_fill 1024 true avgt 30 11003.331 ± 70.467 ns/op TestFill.heap_segment_fill:CPI 1024 true avgt 3 0.208 ± 0.047 clks/insn TestFill.heap_segment_fill:IPC 1024 true avgt 3 4.813 ± 1.077 insns/clk TestFill.heap_segment_fill:L1-dcache-load-misses 1024 true avgt 3 8.734 ± 1.782 #/op TestFill.heap_segment_fill:L1-dcache-loads 1024 true avgt 3 94231.271 ± 4742.906 #/op TestFill.heap_segment_fill:L1-icache-load-misses 1024 true avgt 3 0.506 ± 2.508 #/op TestFill.heap_segment_fill:L1-icache-loads 1024 true avgt 3 83.470 ± 216.408 #/op TestFill.heap_segment_fill:branch-misses 1024 true avgt 3 8.894 ± 8.807 #/op TestFill.heap_segment_fill:branches 1024 true avgt 3 50686.259 ± 404.635 #/op TestFill.heap_segment_fill:cycles 1024 true avgt 3 49969.876 ± 11319.276 #/op TestFill.heap_segment_fill:dTLB-load-misses 1024 true avgt 3 0.187 ± 0.655 #/op TestFill.heap_segment_fill:dTLB-loads 1024 true avgt 3 1.587 ± 3.060 #/op TestFill.heap_segment_fill:iTLB-load-misses 1024 true avgt 3 0.123 ± 0.660 #/op TestFill.heap_segment_fill:iTLB-loads 1024 true avgt 3 0.293 ± 1.287 #/op TestFill.heap_segment_fill:instructions 1024 true avgt 3 240463.595 ± 976.383 #/op TestFill.heap_segment_fill:stalled-cycles-frontend 1024 true avgt 3 255.006 ± 988.846 #/op TestFill.heap_segment_fill 128000 false avgt 30 1259362.873 ± 5934.195 ns/op TestFill.heap_segment_fill:CPI 128000 false avgt 3 0.201 ± 0.025 clks/insn TestFill.heap_segment_fill:IPC 128000 false avgt 3 4.982 ± 0.626 insns/clk TestFill.heap_segment_fill:L1-dcache-load-misses 128000 false avgt 3 2872.859 ± 7141.312 #/op TestFill.heap_segment_fill:L1-dcache-loads 128000 false avgt 3 10657359.179 ± 1907105.367 #/op 
TestFill.heap_segment_fill:L1-icache-load-misses 128000 false avgt 3 60.908 ± 97.434 #/op TestFill.heap_segment_fill:L1-icache-loads 128000 false avgt 3 8853.079 ± 8185.081 #/op TestFill.heap_segment_fill:branch-misses 128000 false avgt 3 881.014 ± 3001.249 #/op TestFill.heap_segment_fill:branches 128000 false avgt 3 6252293.868 ± 150888.746 #/op TestFill.heap_segment_fill:cycles 128000 false avgt 3 5728074.407 ± 820865.748 #/op TestFill.heap_segment_fill:dTLB-load-misses 128000 false avgt 3 24.925 ± 164.673 #/op TestFill.heap_segment_fill:dTLB-loads 128000 false avgt 3 249.671 ± 987.855 #/op TestFill.heap_segment_fill:iTLB-load-misses 128000 false avgt 3 14.258 ± 47.128 #/op TestFill.heap_segment_fill:iTLB-loads 128000 false avgt 3 34.156 ± 248.858 #/op TestFill.heap_segment_fill:instructions 128000 false avgt 3 28538131.024 ± 526036.510 #/op TestFill.heap_segment_fill:stalled-cycles-frontend 128000 false avgt 3 27932.797 ± 27039.568 #/op TestFill.heap_segment_fill 128000 true avgt 30 1857275.169 ± 4604.437 ns/op TestFill.heap_segment_fill:CPI 128000 true avgt 3 0.288 ± 0.009 clks/insn TestFill.heap_segment_fill:IPC 128000 true avgt 3 3.472 ± 0.109 insns/clk TestFill.heap_segment_fill:L1-dcache-load-misses 128000 true avgt 3 3433.246 ± 15336.162 #/op TestFill.heap_segment_fill:L1-dcache-loads 128000 true avgt 3 12940291.898 ± 4889405.663 #/op TestFill.heap_segment_fill:L1-icache-load-misses 128000 true avgt 3 73.450 ± 231.916 #/op TestFill.heap_segment_fill:L1-icache-loads 128000 true avgt 3 13483.446 ± 42337.545 #/op TestFill.heap_segment_fill:branch-misses 128000 true avgt 3 86493.970 ± 8740.093 #/op TestFill.heap_segment_fill:branches 128000 true avgt 3 6320125.417 ± 998773.918 #/op TestFill.heap_segment_fill:cycles 128000 true avgt 3 8406053.515 ± 1319703.106 #/op TestFill.heap_segment_fill:dTLB-load-misses 128000 true avgt 3 34.833 ± 105.768 #/op TestFill.heap_segment_fill:dTLB-loads 128000 true avgt 3 307.842 ± 754.292 #/op 
TestFill.heap_segment_fill:iTLB-load-misses 128000 true avgt 3 23.104 ± 51.968 #/op TestFill.heap_segment_fill:iTLB-loads 128000 true avgt 3 55.073 ± 241.755 #/op TestFill.heap_segment_fill:instructions 128000 true avgt 3 29183047.682 ± 4280293.555 #/op TestFill.heap_segment_fill:stalled-cycles-frontend 128000 true avgt 3 707884.732 ± 176201.245 #/op And `-prof perfasm` correctly shows, for samples = 128000 and shuffle = true: ....[Hottest Region 1].............................................................................. libjvm.so, Unsafe_SetMemory0 (82 bytes) which is likely due to the branches at https://github.com/openjdk/jdk21/blob/890adb6410dab4606a4f26a942aed02fb2f55387/src/hotspot/share/utilities/copy.cpp#L216-L244 ------------- PR Review Comment: https://git.openjdk.org/jdk/pull/20712#discussion_r1732802685