On Sat, 24 May 2025, 18:58 Andy via Gcc, <gcc@gcc.gnu.org> wrote: > Dear GCC developers, > > I would like to ask whether there might be room for improvement in memory > access optimization in GCC. > > I've prepared a simple benchmark in both C++ (using -std=c++20 for digit > separators like 5'000'000) and Java. The benchmark allocates a large array > of > random integers, performs a multiply-sum loop to prevent > dead-code elimination, and measures the time taken to sort the array. >
I don't think it's relevant to the performance measurements, but doesn't your C++ example have undefined behaviour due integer overflow? C++ example: > > ```cpp > #include <algorithm> > #include <chrono> > #include <iostream> > #include <random> > #include <vector> > > int main() { > std::mt19937 rng(1234); > const int nTrials = 20; > long sum = 0; > long long best = std::numeric_limits<long long>::max(); > long long worst = std::numeric_limits<long long>::min(); > long long totalTime = 0; > const int size = 5'000'000; > for (int n = 0; n<nTrials; ++n) { > std::cout << n+1 << "/" << nTrials << std::endl; > std::uniform_int_distribution<int> > dist(std::numeric_limits<int>::min(), > > std::numeric_limits<int>::max()); > std::vector<int> numbers(size); > for (int& num : numbers) { > num = dist(rng); > } > for (int i = 0; i < size; ++i) { > sum += i * numbers[i]; > } > auto start = std::chrono::high_resolution_clock::now(); > std::sort(numbers.begin(), numbers.end()); > auto end = std::chrono::high_resolution_clock::now(); > auto elapsed = std::chrono::duration_cast<std::chrono::nanoseconds> > (end - > start).count(); > totalTime += elapsed; > if (elapsed < best) best = elapsed; > if (elapsed > worst) worst = elapsed; > } > double avgTime = (double)totalTime / nTrials; > double avgCTime = (double)(totalTime-worst)/(nTrials-1);; > std::cout << "(c++) vector sort time " << best / 1e6 << " ms\n"; > std::cout << "nTrials = " << nTrials << "\n"; > std::cout << "size = " << size << "\n"; > std::cout << "worst/best = " << (double)worst/best << "\n"; > std::cout << "avg/best = " << avgTime / best << "\n"; > std::cout << "corrected avg/best = " << avgCTime / best << "\n"; > std::cout << "\n\nresult for prevent eliminate dead code: " << sum << > "\n"; > return 0; > } > ``` > > Java example: > > ```java > import java.util.Arrays; > import java.util.Random; > > import static java.lang.System.out; > > public class Main { > public static void main(String[] args) { > int nTrials = 50; > long sum = 0; > long best = Long.MAX_VALUE; > long worst = Long.MIN_VALUE; > long totalTime = 0; > int size = 5_000_000; > int nWarmupTrials = 6; > Random rand = new Random(1234); > for (int n = 0; n < nTrials; n++) { > if (n < nWarmupTrials) > System.out.print("warmup... "); > System.out.printf("%d/%d%n", n, nTrials); > int[] numbers = new int[size]; > for (int i = 0; i < size; i++) { > numbers[i] = rand.nextInt(); > } > for (int i = 0; i < size; i++) { > sum += (long) i *numbers[i]; > } > > long startTime = System.nanoTime(); > Arrays.sort(numbers); > long endTime = System.nanoTime(); > long elapsed = endTime - startTime; > //System.out.printf(" %f%n", (double) elapsed / 1e6); > if (n < nWarmupTrials) > continue; > totalTime += elapsed; > if (elapsed < best) best = elapsed; > if (elapsed > worst) worst = elapsed; > } > int nEffectiveTrials = nTrials - nWarmupTrials; > double avgTime = (double)totalTime / nEffectiveTrials;; > double avgCTime = (double)(totalTime-worst)/(nEffectiveTrials-1); > System.out.println("java.vm.version=" + System.getProperty > > ("java.vm.version")); > System.out.println("java.vm.name=" + > System.getProperty("java.vm.name")); > System.out.println("java.version=" + > System.getProperty("java.version")); > System.out.printf("(java) vector sort time %f ms%n", (double) > best / 1e6); > System.out.printf("nTrials=%d = %d (warmUp) + %d (effective)%n", > nTrials, nWarmupTrials, nEffectiveTrials); > System.out.printf("size=%d%n", size); > System.out.printf("worst/best=%f%n", (double) worst / best); > System.out.printf("avg/best=%f%n", avgTime / best); > System.out.printf("corrected avg/best=%f%n", avgCTime / best); > out.println("\n\n" + sum); > } > } > ``` > > Results > > C++ (Release mode, -O3): > ``` > (c++) vector sort time 410.469 ms > nTrials = 20 > size = 5000000 > worst/best = 1.0515 > avg/best = 1.01533 > corrected avg/best = 1.01342 > ``` > > Java: > ``` > java.vm.version=24.0.1+9-30 > java.vm.name=OpenJDK 64-Bit Server VM > java.version=24.0.1 > (java) vector sort time 69.322373 ms > nTrials=50 = 6 (warmUp) + 44 (effective) > size=5000000 > worst/best=1.353855 > avg/best=1.094769 > corrected avg/best=1.088744 > ``` > Notes: > > The JVM was run with the following options to enable compilation > logging > and assembly output: > > -XX:+UnlockDiagnosticVMOptions -XX:+LogCompilation -XX:+PrintAssembly > > This produces a file like hotspot_pidXXXXX.log, which can be inspected with > JITWatch. > > Searching for DualPivotQuicksort.sort(...) in the JIT assembly reveals > very efficient memory access patterns. > > Comparative observations: > ``` > Layer | Integer Comparison | Integer Move > > -----------|-----------------------------|------------------------------------ > Assembler | `cmp eax, edx`, `jl`, `jge` | `mov eax, [mem]`, `mov [mem], > eax` > JIT | `cmp eax, edx`, `jle`, `jg` | `mov eax, [array+offset]` > ``` > One possible explanation is aliasing: the Java memory model can often prove > there is no aliasing between array accesses, while in C/C++ this is harder. > > In C/C++, one can use the restrict qualifier to aid the compiler: > > void foo(int* __restrict__ a, int* __restrict__ b); > > Would using `restrict` help in this case? > Are there other techniques or flags I should explore to help GCC optimize > this memory access pattern more like the JVM does? > > For transparency: I’m also sending a similar message to the LLVM developers > list to get a broader view across compilers. > > Thank you in advance for your insights! > > Best regards, > Andrzej Borucki >