On Tue, 3 Sep 2024 07:52:44 GMT, Per Minborg <pminb...@openjdk.org> wrote:
> This PR proposes to handle smaller FFM copy operations with Java code rather > than transitioning to native code. This will improve performance. In this PR, > copy operations involving zero to 63 bytes will be handled by Java code. > > Here is what it looks like for Windows x64: > >  > > Here is another chart for Linux a64: > >  > > Other platforms exhibit similar behavior. It should be noted that the gain > with this PR is pronounced for certain common sizes that are more likely to > appear in code (e.g. 8, 16, 24, and 32) > > It would be possible to use the same code path for the 7arg > `MemorySegment::copy` method if it is similar to: > > > MemorySegment.copy(heapSrcSegment, JAVA_BYTE, 0, heapDstSegment, JAVA_BYTE, > 0, ELEM_SIZE); > > > This could be added in a separate PR. > > This PR has been tested with tier1-3 and passed.
I would suggest this additional benchmark as well

import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.BenchmarkMode;
import org.openjdk.jmh.annotations.CompilerControl;
import org.openjdk.jmh.annotations.Fork;
import org.openjdk.jmh.annotations.Measurement;
import org.openjdk.jmh.annotations.Mode;
import org.openjdk.jmh.annotations.OutputTimeUnit;
import org.openjdk.jmh.annotations.Param;
import org.openjdk.jmh.annotations.Scope;
import org.openjdk.jmh.annotations.Setup;
import org.openjdk.jmh.annotations.State;
import org.openjdk.jmh.annotations.Warmup;
import org.openjdk.jmh.infra.BenchmarkParams;

import java.lang.foreign.Arena;
import java.lang.foreign.MemorySegment;
// NOTE(review): ByteBuffer is not referenced anywhere in this class.
import java.nio.ByteBuffer;
import java.util.concurrent.TimeUnit;

/**
 * JMH benchmark for the 5-argument {@code MemorySegment.copy} on heap-backed
 * and native segments of {@code ELEM_SIZE} bytes.
 *
 * <p>When {@code polluteCopy} is {@code true}, {@link #setup} runs a batch of
 * copies before measurement starts; the accompanying message describes this
 * as polluting the type profile of the copy methods. When it is
 * {@code false}, no such pre-run happens.
 *
 * <p>Results are reported as average time in nanoseconds, using 3 forks,
 * 5 warmup iterations and 10 measurement iterations of 500 ms each.
 */
@BenchmarkMode(Mode.AverageTime)
@Warmup(iterations = 5, time = 500, timeUnit = TimeUnit.MILLISECONDS)
@Measurement(iterations = 10, time = 500, timeUnit = TimeUnit.MILLISECONDS)
@State(Scope.Thread)
@OutputTimeUnit(TimeUnit.NANOSECONDS)
@Fork(value = 3)
public class PolluteCopyTest {

    /** Number of bytes copied per call; also the size of every segment created in {@link #setup}. */
    @Param({"0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15",
            "16", "17", "18", "19", "20", "21", "22", "23", "24", "25", "26", "27", "28", "29", "30", "31",
            "32", "33", "36", "40", "44", "48", "52", "56", "60", "63", "64", "128"})
    public int ELEM_SIZE;

    // Segments wrapping byte[] arrays (see setup).
    MemorySegment heapSrcSegment;
    MemorySegment heapDstSegment;
    // Segments allocated from Arena.ofAuto() (see setup).
    MemorySegment nativeSrcSegment;
    MemorySegment nativeDstSegment;

    /** Whether {@link #setup} performs the extra copy pre-run before measurement. */
    @Param({"false", "true"})
    public boolean polluteCopy;

    /**
     * Creates the four segments and, if {@code polluteCopy} is set, runs
     * 15_000 rounds of copies before the benchmark is measured.
     *
     * @param params JMH-provided parameters; only the benchmark name is read,
     *               to choose which pre-run variant to execute
     */
    @Setup
    public void setup(BenchmarkParams params) {
        byte[] srcArray = new byte[ELEM_SIZE];
        byte[] dstArray = new byte[ELEM_SIZE];
        heapSrcSegment = MemorySegment.ofArray(srcArray);
        heapDstSegment = MemorySegment.ofArray(dstArray);
        nativeSrcSegment = Arena.ofAuto().allocate(ELEM_SIZE);
        nativeDstSegment = Arena.ofAuto().allocate(ELEM_SIZE);
        if (polluteCopy) {
            // NOTE(review): 15_000 is presumably meant to be enough calls to
            // get the copy path profiled/compiled before measurement - confirm.
            if (params.getBenchmark().contains("not_inlined")) {
                // For the *_not_inlined benchmarks, pre-run through the
                // benchmark methods themselves.
                // NOTE(review): this branch only exercises heap->heap and
                // native->native; the branch below also covers the mixed
                // pairs - confirm that the difference is intended.
                for (int i = 0; i < 15_000; i++) {
                    heap_segment_copy5Arg_not_inlined();
                    native_segment_copy5Arg_not_inlined();
                }
            } else {
                // Otherwise call MemorySegment.copy directly with all four
                // source/destination combinations of heap and native segments.
                for (int i = 0; i < 15_000; i++) {
                    MemorySegment.copy(heapSrcSegment, 0, heapDstSegment, 0, ELEM_SIZE);
                    MemorySegment.copy(nativeSrcSegment, 0, nativeDstSegment, 0, ELEM_SIZE);
                    MemorySegment.copy(heapSrcSegment, 0, nativeDstSegment, 0, ELEM_SIZE);
                    MemorySegment.copy(nativeSrcSegment, 0, heapDstSegment, 0, ELEM_SIZE);
                }
            }
        }
    }

    /**
     * Copies {@code ELEM_SIZE} bytes from offset 0 of the heap source segment
     * to offset 0 of the heap destination segment. JMH is asked not to inline
     * this method.
     */
    @Benchmark
    @CompilerControl(CompilerControl.Mode.DONT_INLINE)
    public void heap_segment_copy5Arg_not_inlined() {
        MemorySegment.copy(heapSrcSegment, 0, heapDstSegment, 0, ELEM_SIZE);
    }

    /**
     * Copies {@code ELEM_SIZE} bytes from offset 0 of the native source
     * segment to offset 0 of the native destination segment. JMH is asked not
     * to inline this method.
     */
    @Benchmark
    @CompilerControl(CompilerControl.Mode.DONT_INLINE)
    public void native_segment_copy5Arg_not_inlined() {
        MemorySegment.copy(nativeSrcSegment, 0, nativeDstSegment, 0, ELEM_SIZE);
    }

    /** Same heap-to-heap copy as the not_inlined variant, without the compiler-control annotation. */
    @Benchmark
    public void heap_segment_copy5Arg() {
        MemorySegment.copy(heapSrcSegment, 0, heapDstSegment, 0, ELEM_SIZE);
    }

    /** Same native-to-native copy as the not_inlined variant, without the compiler-control annotation. */
    @Benchmark
    public void native_segment_copy5Arg() {
        MemorySegment.copy(nativeSrcSegment, 0, nativeDstSegment, 0, ELEM_SIZE);
    }
}

which is not super stable (we can disable background compilation and enable type profiling - to make sure that we are doing things right by the end
of `Setup` - or of warmup at least). It pollutes the type profile of the existing copy methods in different ways - but once compiled, the outcome will ultimately depend on how the types are handled. There's no branch misprediction here; the question is rather "what happens if we move the ops to Java and the Java code is type-"poisoned/polluted"?" - because that is what could happen in the real world (a mix of heap/off-heap segment types), and we want to capture what this PR gets in such a case as well. ------------- PR Comment: https://git.openjdk.org/jdk/pull/20829#issuecomment-2326404582