Possible memory access optimization opportunity? Comparison with Java JIT

Andy via Gcc Sat, 24 May 2025 11:15:54 -0700

Dear GCC developers,

I would like to ask whether there might be room for improvement in memory
access optimization in GCC.


I've prepared a simple benchmark in both C++ (using -std=c++20 for digit
separators like 5'000'000) and Java. The benchmark allocates a large array of
random integers, performs a multiply-sum loop to prevent
dead-code elimination, and measures the time taken to sort the array.
C++ example:

```cpp
#include <algorithm>
#include <chrono>
#include <iostream>
#include <random>
#include <vector>

int main() {
    std::mt19937 rng(1234);
    const int nTrials = 20;
    long sum = 0;
    long long best = std::numeric_limits<long long>::max();
    long long worst = std::numeric_limits<long long>::min();
    long  long totalTime = 0;
    const int size = 5'000'000;
    for (int n = 0; n<nTrials; ++n) {
        std::cout << n+1 << "/" << nTrials << std::endl;
        std::uniform_int_distribution<int> dist(std::numeric_limits<int>::min(),
                                           std::numeric_limits<int>::max());
        std::vector<int> numbers(size);
        for (int& num : numbers) {
            num = dist(rng);
        }
        for (int i = 0; i < size; ++i) {
            sum += i * numbers[i];
        }
        auto start = std::chrono::high_resolution_clock::now();
        std::sort(numbers.begin(), numbers.end());
        auto end = std::chrono::high_resolution_clock::now();
        auto elapsed = std::chrono::duration_cast<std::chrono::nanoseconds>
                                                      (end - start).count();
        totalTime +=  elapsed;
        if (elapsed < best) best = elapsed;
        if (elapsed > worst) worst = elapsed;
    }
    double avgTime = (double)totalTime / nTrials;
    double avgCTime = (double)(totalTime-worst)/(nTrials-1);;
    std::cout << "(c++) vector sort time " << best / 1e6 << " ms\n";
    std::cout << "nTrials = " << nTrials << "\n";
    std::cout << "size = " << size << "\n";
    std::cout << "worst/best = " << (double)worst/best << "\n";
    std::cout << "avg/best = " << avgTime / best << "\n";
    std::cout << "corrected avg/best = " << avgCTime / best << "\n";
    std::cout << "\n\nresult for prevent eliminate dead code: " << sum << "\n";
    return 0;
}
```

Java example:

```java
import java.util.Arrays;
import java.util.Random;

import static java.lang.System.out;

public class Main {
    public static void main(String[] args) {
        int nTrials = 50;
        long sum = 0;
        long best = Long.MAX_VALUE;
        long worst = Long.MIN_VALUE;
        long totalTime = 0;
        int size = 5_000_000;
        int nWarmupTrials = 6;
        Random rand = new Random(1234);
        for (int n = 0; n < nTrials; n++) {
            if (n < nWarmupTrials)
                System.out.print("warmup... ");
            System.out.printf("%d/%d%n", n, nTrials);
            int[] numbers = new int[size];
            for (int i = 0; i < size; i++) {
                numbers[i] = rand.nextInt();
            }
            for (int i = 0; i < size; i++) {
                sum += (long) i *numbers[i];
            }

            long startTime = System.nanoTime();
            Arrays.sort(numbers);
            long endTime = System.nanoTime();
            long elapsed = endTime - startTime;
            //System.out.printf("  %f%n", (double) elapsed / 1e6);
            if (n < nWarmupTrials)
                continue;
            totalTime += elapsed;
            if (elapsed < best) best = elapsed;
            if (elapsed > worst) worst = elapsed;
        }
        int nEffectiveTrials = nTrials - nWarmupTrials;
        double avgTime = (double)totalTime / nEffectiveTrials;;
        double avgCTime = (double)(totalTime-worst)/(nEffectiveTrials-1);
        System.out.println("java.vm.version=" + System.getProperty
                                                           ("java.vm.version"));
        System.out.println("java.vm.name=" +
System.getProperty("java.vm.name"));
        System.out.println("java.version=" +
System.getProperty("java.version"));
        System.out.printf("(java) vector sort time %f ms%n", (double)
best / 1e6);
        System.out.printf("nTrials=%d = %d (warmUp) + %d (effective)%n",
                nTrials, nWarmupTrials, nEffectiveTrials);
        System.out.printf("size=%d%n", size);
        System.out.printf("worst/best=%f%n", (double) worst / best);
        System.out.printf("avg/best=%f%n", avgTime / best);
        System.out.printf("corrected avg/best=%f%n", avgCTime / best);
        out.println("\n\n" + sum);
    }
}
```

Results

C++ (Release mode, -O3):
```
(c++) vector sort time 410.469 ms
nTrials = 20
size = 5000000
worst/best = 1.0515
avg/best = 1.01533
corrected avg/best = 1.01342
```

Java:
```
java.vm.version=24.0.1+9-30
java.vm.name=OpenJDK 64-Bit Server VM
java.version=24.0.1
(java) vector sort time 69.322373 ms
nTrials=50 = 6 (warmUp) + 44 (effective)
size=5000000
worst/best=1.353855
avg/best=1.094769
corrected avg/best=1.088744
```
Notes:

    The JVM was run with the following options to enable compilation logging
    and assembly output:

-XX:+UnlockDiagnosticVMOptions -XX:+LogCompilation -XX:+PrintAssembly

This produces a file like hotspot_pidXXXXX.log, which can be inspected with
JITWatch.

    Searching for DualPivotQuicksort.sort(...) in the JIT assembly reveals
    very efficient memory access patterns.

Comparative observations:
```
 Layer     | Integer Comparison          | Integer Move
-----------|-----------------------------|------------------------------------
 Assembler | `cmp eax, edx`, `jl`, `jge` | `mov eax, [mem]`, `mov [mem], eax`
 JIT       | `cmp eax, edx`, `jle`, `jg` | `mov eax, [array+offset]`
```
One possible explanation is aliasing: the Java memory model can often prove
there is no aliasing between array accesses, while in C/C++ this is harder.

In C/C++, one can use the restrict qualifier to aid the compiler:

void foo(int* __restrict__ a, int* __restrict__ b);

Would using `restrict` help in this case?
Are there other techniques or flags I should explore to help GCC optimize
this memory access pattern more like the JVM does?

For transparency: I’m also sending a similar message to the LLVM developers
list to get a broader view across compilers.

Thank you in advance for your insights!

Best regards,
Andrzej Borucki

Possible memory access optimization opportunity? Comparison with Java JIT

Reply via email to