Re: Possible memory access optimization opportunity? Comparison with Java JIT

2025-05-24 Thread Jonathan Wakely via Gcc
On Sat, 24 May 2025, 18:58 Andy via Gcc,  wrote:

> Dear GCC developers,
>
> I would like to ask whether there might be room for improvement in memory
> access optimization in GCC.
>
> I've prepared a simple benchmark in both C++ (using -std=c++20 for digit
> separators like 5'000'000) and Java. The benchmark allocates a large array
> of
> random integers, performs a multiply-sum loop to prevent
> dead-code elimination, and measures the time taken to sort the array.
>

I don't think it's relevant to the performance measurements, but doesn't
your C++ example have undefined behaviour due integer overflow?



C++ example:
>
> ```cpp
> #include 
> #include 
> #include 
> #include 
> #include 
>
> int main() {
> std::mt19937 rng(1234);
> const int nTrials = 20;
> long sum = 0;
> long long best = std::numeric_limits::max();
> long long worst = std::numeric_limits::min();
> long  long totalTime = 0;
> const int size = 5'000'000;
> for (int n = 0; n std::cout << n+1 << "/" << nTrials << std::endl;
> std::uniform_int_distribution
> dist(std::numeric_limits::min(),
>
>  std::numeric_limits::max());
> std::vector numbers(size);
> for (int& num : numbers) {
> num = dist(rng);
> }
> for (int i = 0; i < size; ++i) {
> sum += i * numbers[i];
> }
> auto start = std::chrono::high_resolution_clock::now();
> std::sort(numbers.begin(), numbers.end());
> auto end = std::chrono::high_resolution_clock::now();
> auto elapsed = std::chrono::duration_cast
>   (end -
> start).count();
> totalTime +=  elapsed;
> if (elapsed < best) best = elapsed;
> if (elapsed > worst) worst = elapsed;
> }
> double avgTime = (double)totalTime / nTrials;
> double avgCTime = (double)(totalTime-worst)/(nTrials-1);;
> std::cout << "(c++) vector sort time " << best / 1e6 << " ms\n";
> std::cout << "nTrials = " << nTrials << "\n";
> std::cout << "size = " << size << "\n";
> std::cout << "worst/best = " << (double)worst/best << "\n";
> std::cout << "avg/best = " << avgTime / best << "\n";
> std::cout << "corrected avg/best = " << avgCTime / best << "\n";
> std::cout << "\n\nresult for prevent eliminate dead code: " << sum <<
> "\n";
> return 0;
> }
> ```
>
> Java example:
>
> ```java
> import java.util.Arrays;
> import java.util.Random;
>
> import static java.lang.System.out;
>
> public class Main {
> public static void main(String[] args) {
> int nTrials = 50;
> long sum = 0;
> long best = Long.MAX_VALUE;
> long worst = Long.MIN_VALUE;
> long totalTime = 0;
> int size = 5_000_000;
> int nWarmupTrials = 6;
> Random rand = new Random(1234);
> for (int n = 0; n < nTrials; n++) {
> if (n < nWarmupTrials)
> System.out.print("warmup... ");
> System.out.printf("%d/%d%n", n, nTrials);
> int[] numbers = new int[size];
> for (int i = 0; i < size; i++) {
> numbers[i] = rand.nextInt();
> }
> for (int i = 0; i < size; i++) {
> sum += (long) i *numbers[i];
> }
>
> long startTime = System.nanoTime();
> Arrays.sort(numbers);
> long endTime = System.nanoTime();
> long elapsed = endTime - startTime;
> //System.out.printf("  %f%n", (double) elapsed / 1e6);
> if (n < nWarmupTrials)
> continue;
> totalTime += elapsed;
> if (elapsed < best) best = elapsed;
> if (elapsed > worst) worst = elapsed;
> }
> int nEffectiveTrials = nTrials - nWarmupTrials;
> double avgTime = (double)totalTime / nEffectiveTrials;;
> double avgCTime = (double)(totalTime-worst)/(nEffectiveTrials-1);
> System.out.println("java.vm.version=" + System.getProperty
>
>  ("java.vm.version"));
> System.out.println("java.vm.name=" +
> System.getProperty("java.vm.name"));
> System.out.println("java.version=" +
> System.getProperty("java.version"));
> System.out.printf("(java) vector sort time %f ms%n", (double)
> best / 1e6);
> System.out.printf("nTrials=%d = %d (warmUp) + %d (effective)%n",
> nTrials, nWarmupTrials, nEffectiveTrials);
> System.out.printf("size=%d%n", size);
> System.out.printf("worst/best=%f%n", (double) worst / best);
> System.out.printf("avg/best=%f%n", avgTime / best);
> System.out.printf("corrected avg/best=%f%n", avgCTime / best);
> out.println("\n\n" + sum);
> }
> }
> ```
>
> Results
>
> C++ (Release mode, -O3):
> ```
> (c++) vector sort time 410.469 ms
> nTrials = 20
> size = 500
> worst/best = 1.0515
> avg/best = 1.01533
> corrected avg/best = 1.01342
> ```
>
> Java:
> `

RFD: switch/case statement dispatch using hash

2025-05-24 Thread Joern Wolfgang Rennecke

This has come up several time over the years:
https://gcc.gnu.org/legacy-ml/gcc/2006-07/msg00158.html
https://gcc.gnu.org/legacy-ml/gcc/2006-07/msg00155.html
https://gcc.gnu.org/pipermail/gcc/2010-March/190234.html

, but maybe now (or maybe a while ago) is the right time to do this, 
considering the changes in relative costs of basic operations?

Multiply and barrel shift are cheap on many modern microarchitectures.
Control flow and non-linear memory access is expensive.

FWIW, [Dietz92]
https://docs.lib.purdue.edu/cgi/viewcontent.cgi?article=1312&context=ecetr
mentions multiply in passing as unpractical for SPARC because of cost, 
but modern CPUs can often do a multiply in a single cycle.


Approximating division and scaling, for a case value x, we can calculate 
an index or offset into a table as

f(x) = x*C1 >> C2 & M

For an index, M masks off the upper bits so that the index fits into
a table that has a number of elements that is a power of two.
For architectures where a non-scaled index in cheaper to use than
a scaled one, we compute an offset by having M also mask off the lower
bits.

Each table entry contains a jump address (or offset) and a key - at 
least if both are the same size; for different sizes, it might be 
cheaper to have two tables.
If we have found values for C1 and C2 that give a perfect hash, we can 
immediately dispatch to the default case for a non-match; otherwise, we 
can have decision trees at the jump destinations, each using the 
comparison with the key from the table for the first decision.
No separate range check is necessary, so if the multiply is fast enough, 
this should be close in performance to an ordinary tablejump.


This dispatch method can be used for tables that are too sparse for a 
tablejump,but have enough cases to justify the overhead (depending on 
multiple conditional vs single indirect branch costs, the latter might 
be a low bar).


I suppose we could make tree-switch-conversion.cc use rtx costs to 
compare the hash implementation to a decision tree, or have a hook make 
the decision - and the default for the hook might use rtx costs...


gcc-15-20250524 is now available

2025-05-24 Thread GCC Administrator via Gcc
Snapshot gcc-15-20250524 is now available on
  https://gcc.gnu.org/pub/gcc/snapshots/15-20250524/
and on various mirrors, see https://gcc.gnu.org/mirrors.html for details.

This snapshot has been generated from the GCC 15 git branch
with the following options: git://gcc.gnu.org/git/gcc.git branch 
releases/gcc-15 revision 18f937f013b79f4280c66b05fa793a0780955081

You'll find:

 gcc-15-20250524.tar.xz   Complete GCC

  SHA256=8449307cbfe9c25a32eafd78fb4dd53f1c85dcf66fc2cfa8e3b30aeb70266508
  SHA1=f1272586f7592a3cc786f13fc360b597cc2e5b69

Diffs from 15-20250517 are available in the diffs/ subdirectory.

When a particular snapshot is ready for public consumption the LATEST-15
link is updated and a message is sent to the gcc list.  Please do not use
a snapshot before it has been announced that way.


Possible memory access optimization opportunity? Comparison with Java JIT

2025-05-24 Thread Andy via Gcc
Dear GCC developers,

I would like to ask whether there might be room for improvement in memory
access optimization in GCC.

I've prepared a simple benchmark in both C++ (using -std=c++20 for digit
separators like 5'000'000) and Java. The benchmark allocates a large array of
random integers, performs a multiply-sum loop to prevent
dead-code elimination, and measures the time taken to sort the array.
C++ example:

```cpp
#include 
#include 
#include 
#include 
#include 

int main() {
std::mt19937 rng(1234);
const int nTrials = 20;
long sum = 0;
long long best = std::numeric_limits::max();
long long worst = std::numeric_limits::min();
long  long totalTime = 0;
const int size = 5'000'000;
for (int n = 0; n dist(std::numeric_limits::min(),
   std::numeric_limits::max());
std::vector numbers(size);
for (int& num : numbers) {
num = dist(rng);
}
for (int i = 0; i < size; ++i) {
sum += i * numbers[i];
}
auto start = std::chrono::high_resolution_clock::now();
std::sort(numbers.begin(), numbers.end());
auto end = std::chrono::high_resolution_clock::now();
auto elapsed = std::chrono::duration_cast
  (end - start).count();
totalTime +=  elapsed;
if (elapsed < best) best = elapsed;
if (elapsed > worst) worst = elapsed;
}
double avgTime = (double)totalTime / nTrials;
double avgCTime = (double)(totalTime-worst)/(nTrials-1);;
std::cout << "(c++) vector sort time " << best / 1e6 << " ms\n";
std::cout << "nTrials = " << nTrials << "\n";
std::cout << "size = " << size << "\n";
std::cout << "worst/best = " << (double)worst/best << "\n";
std::cout << "avg/best = " << avgTime / best << "\n";
std::cout << "corrected avg/best = " << avgCTime / best << "\n";
std::cout << "\n\nresult for prevent eliminate dead code: " << sum << "\n";
return 0;
}
```

Java example:

```java
import java.util.Arrays;
import java.util.Random;

import static java.lang.System.out;

public class Main {
public static void main(String[] args) {
int nTrials = 50;
long sum = 0;
long best = Long.MAX_VALUE;
long worst = Long.MIN_VALUE;
long totalTime = 0;
int size = 5_000_000;
int nWarmupTrials = 6;
Random rand = new Random(1234);
for (int n = 0; n < nTrials; n++) {
if (n < nWarmupTrials)
System.out.print("warmup... ");
System.out.printf("%d/%d%n", n, nTrials);
int[] numbers = new int[size];
for (int i = 0; i < size; i++) {
numbers[i] = rand.nextInt();
}
for (int i = 0; i < size; i++) {
sum += (long) i *numbers[i];
}

long startTime = System.nanoTime();
Arrays.sort(numbers);
long endTime = System.nanoTime();
long elapsed = endTime - startTime;
//System.out.printf("  %f%n", (double) elapsed / 1e6);
if (n < nWarmupTrials)
continue;
totalTime += elapsed;
if (elapsed < best) best = elapsed;
if (elapsed > worst) worst = elapsed;
}
int nEffectiveTrials = nTrials - nWarmupTrials;
double avgTime = (double)totalTime / nEffectiveTrials;;
double avgCTime = (double)(totalTime-worst)/(nEffectiveTrials-1);
System.out.println("java.vm.version=" + System.getProperty
   ("java.vm.version"));
System.out.println("java.vm.name=" +
System.getProperty("java.vm.name"));
System.out.println("java.version=" +
System.getProperty("java.version"));
System.out.printf("(java) vector sort time %f ms%n", (double)
best / 1e6);
System.out.printf("nTrials=%d = %d (warmUp) + %d (effective)%n",
nTrials, nWarmupTrials, nEffectiveTrials);
System.out.printf("size=%d%n", size);
System.out.printf("worst/best=%f%n", (double) worst / best);
System.out.printf("avg/best=%f%n", avgTime / best);
System.out.printf("corrected avg/best=%f%n", avgCTime / best);
out.println("\n\n" + sum);
}
}
```

Results

C++ (Release mode, -O3):
```
(c++) vector sort time 410.469 ms
nTrials = 20
size = 500
worst/best = 1.0515
avg/best = 1.01533
corrected avg/best = 1.01342
```

Java:
```
java.vm.version=24.0.1+9-30
java.vm.name=OpenJDK 64-Bit Server VM
java.version=24.0.1
(java) vector sort time 69.322373 ms
nTrials=50 = 6 (warmUp) + 44 (effective)
size=500
worst/best=1.353855
avg/best=1.094769
corrected avg/best=1.088744
```
Notes:

The JVM was run with the following options to enable compilation logging
and assembly output:

-XX:+UnlockDiagnosticVMOptions -XX:+LogCompilation -XX:+PrintAssembly

This produces a file lik