Re: RFR: 8355216: Accelerate P-256 arithmetic on aarch64 [v5]

Ferenc Rakoczi Fri, 22 May 2026 06:07:23 -0700

On Mon, 18 May 2026 09:10:20 GMT, Andrew Dinn <[email protected]> wrote:


>> Ferenc Rakoczi has updated the pull request incrementally with one 
>> additional commit since the last revision:
>> 
>>   Accepting more suggestions from Andrew Dinn.
>
> src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp line 7863:
> 
>> 7861:     __ sub(sp, sp, 128);
>> 7862:     __ mov(mul_ptr, sp);
>> 7863: 
> 
> Suggestion:
> 
>     // neon and gpr computations are interleaved to maximize parallelism

Accepted.

> src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp line 7864:
> 
>> 7862:     __ mov(mul_ptr, sp);
>> 7863: 
>> 7864:     neon_partial_mult_64(A, b_lows, a_vals, 0);
> 
> Suggestion:
> 
>     // cross-multiply low * low for limbs b0-b3 and a3-a4 in parallel
>     neon_partial_mult_64(A, b_lows, a_vals, 0);

Accepted.

> src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp line 7871:
> 
>> 7869:     __ andr(n, low, limb_mask);
>> 7870: 
>> 7871:     neon_partial_mult_64(B, b_highs, a_vals, 0);
> 
> Suggestion:
> 
>     // cross-multiply high * low for limbs b0-b3 and a3-a4 in parallel
>     neon_partial_mult_64(B, b_highs, a_vals, 0);

Accepted.

> src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp line 7880:
> 
>> 7878:     __ add(c_i, c_i, high);
>> 7879: 
>> 7880:     neon_partial_mult_64(C, b_lows, a_vals, 1);
> 
> Suggestion:
> 
>     // cross-multiply low * high for limbs b0-b3 and a3-a4 in parallel
>     neon_partial_mult_64(C, b_lows, a_vals, 1);

Accepted.

> src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp line 7885:
> 
>> 7883:     gpr_partial_mult_52(a_i, b_1, high, low, tmp, limb_mask);
>> 7884: 
>> 7885:     neon_partial_mult_64(D, b_highs, a_vals, 1);
> 
> Suggestion:
> 
>     // cross-multiply high * high for limbs b0-b3 and a3-a4 in parallel
>     neon_partial_mult_64(D, b_highs, a_vals, 1);

Accepted.

> src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp line 7894:
> 
>> 7892:     __ mov(c_i, high);
>> 7893: 
>> 7894:     vs_addv(B, __ T2D, B, C); // Store (B+C) in B
> 
> Suggestion:
> 
>     // combine neon 32-bit partial products, regrouping to produce
>     // 8*52-bit low products in A and 8*52-bit high products in D
> 
>     // add low*high/high*low intermediate products before regrouping
>     vs_addv(B, __ T2D, B, C); // Store (B+C) in B

Accepted.

> src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp line 7902:
> 
>> 7900:     __ mov(c_i, high);
>> 7901: 
>> 7902:     vs_shl(D, __ T2D, D, montMulP256Shift1);
> 
> Suggestion:
> 
>     // shift high*high (40-bit) product up into 52-bits of output
>     vs_shl(D, __ T2D, D, montMulP256Shift1);

Accepted.

> src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp line 7907:
> 
>> 7905:     gpr_partial_mult_52(a_i, b_3, high, low, tmp, limb_mask);
>> 7906: 
>> 7907:     vs_ushr(C, __ T2D, B, 32 - montMulP256Shift1); // Use C for ((B+C) 
>> >>> 20)
> 
> Suggestion:
> 
>     // shift high 32 (or 33) bits of intermediate products for addition to D
>     vs_ushr(C, __ T2D, B, 32 - montMulP256Shift1); // Use C for ((B+C) >>> 20)

Accepted.

> src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp line 7916:
> 
>> 7914:     __ mov(c_i, high);
>> 7915: 
>> 7916:     vs_shl(B, __ T2D, B, 32);
> 
> Suggestion:
> 
>     // shift low 32 bits of intermediate product up for masking and addition 
> to A
>     vs_shl(B, __ T2D, B, 32);

Accepted.

> src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp line 7921:
> 
>> 7919:     gpr_partial_mult_52(a_i, b_4, high, low, tmp, limb_mask);
>> 7920: 
>> 7921:     vs_addv(D, __ T2D, D, C);
> 
> Suggestion:
> 
>     // add high bits of intermediate product into D 
>     vs_addv(D, __ T2D, D, C);

Accepted.

> src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp line 7930:
> 
>> 7928:     __ str(high, Address(c_ptr, 32));
>> 7929: 
>> 7930:     vs_ushr(C, __ T2D, A, 52); // C now holds (A >>> 52)
> 
> Suggestion:
> 
>     // top 12 bits of 32*32 bit product in A need adding into high 56-bit 
> output
>     vs_ushr(C, __ T2D, A, 52); // C now holds (A >>> 52)

Accepted.

> src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp line 7931:
> 
>> 7929: 
>> 7930:     vs_ushr(C, __ T2D, A, 52); // C now holds (A >>> 52)
>> 7931:     vs_andr(B, B, limb_mask_vec);
> 
> Suggestion:
> 
>     // Only 20 of the 32 bits now in the top of B should be added into A
>     vs_andr(B, B, limb_mask_vec);

Accepted.

> src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp line 7932:
> 
>> 7930:     vs_ushr(C, __ T2D, A, 52); // C now holds (A >>> 52)
>> 7931:     vs_andr(B, B, limb_mask_vec);
>> 7932:     vs_andr(A, A, limb_mask_vec);
> 
> Suggestion:
> 
>     // reduce original 64-bit product to 52-bits
>     vs_andr(A, A, limb_mask_vec);

Accepted.

> src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp line 7933:
> 
>> 7931:     vs_andr(B, B, limb_mask_vec);
>> 7932:     vs_andr(A, A, limb_mask_vec);
>> 7933:     vs_addv(D, __ T2D, D, C);
> 
> Suggestion:
> 
>     // add intermediate products to high 52-bit result in D
>     vs_addv(D, __ T2D, D, C);

Accepted.

> src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp line 7934:
> 
>> 7932:     vs_andr(A, A, limb_mask_vec);
>> 7933:     vs_addv(D, __ T2D, D, C);
>> 7934:     vs_addv(A, __ T2D, A, B);
> 
> Suggestion:
> 
>     // add 20/21 bits of intermediate product in top of B into low 52-bit 
> result
>     vs_addv(A, __ T2D, A, B);

Accepted.

> src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp line 7936:
> 
>> 7934:     vs_addv(A, __ T2D, A, B);
>> 7935:     vs_ushr(B, __ T2D, A, montMulP256Shift2);
>> 7936:     vs_andr(A, A, limb_mask_vec);
> 
> Suggestion:
> 
>     // save and then mask off any overflow bit from computing low 52-bit 
> result
>     vs_ushr(B, __ T2D, A, montMulP256Shift2);
>     vs_andr(A, A, limb_mask_vec);

Accepted.

> src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp line 7937:
> 
>> 7935:     vs_ushr(B, __ T2D, A, montMulP256Shift2);
>> 7936:     vs_andr(A, A, limb_mask_vec);
>> 7937:     vs_addv(D, __ T2D, D, B);
> 
> Suggestion:
> 
>     // add any remaining carry into the high 52-bit result
>     vs_addv(D, __ T2D, D, B);

Accepted.

-------------

PR Review Comment: https://git.openjdk.org/jdk/pull/30941#discussion_r3288502224
PR Review Comment: https://git.openjdk.org/jdk/pull/30941#discussion_r3288502832
PR Review Comment: https://git.openjdk.org/jdk/pull/30941#discussion_r3288503636
PR Review Comment: https://git.openjdk.org/jdk/pull/30941#discussion_r3288504371
PR Review Comment: https://git.openjdk.org/jdk/pull/30941#discussion_r3288505283
PR Review Comment: https://git.openjdk.org/jdk/pull/30941#discussion_r3288506003
PR Review Comment: https://git.openjdk.org/jdk/pull/30941#discussion_r3288506839
PR Review Comment: https://git.openjdk.org/jdk/pull/30941#discussion_r3288507293
PR Review Comment: https://git.openjdk.org/jdk/pull/30941#discussion_r3288508193
PR Review Comment: https://git.openjdk.org/jdk/pull/30941#discussion_r3288509570
PR Review Comment: https://git.openjdk.org/jdk/pull/30941#discussion_r3288510285
PR Review Comment: https://git.openjdk.org/jdk/pull/30941#discussion_r3288516819
PR Review Comment: https://git.openjdk.org/jdk/pull/30941#discussion_r3288517510
PR Review Comment: https://git.openjdk.org/jdk/pull/30941#discussion_r3288518070
PR Review Comment: https://git.openjdk.org/jdk/pull/30941#discussion_r3288520182
PR Review Comment: https://git.openjdk.org/jdk/pull/30941#discussion_r3288520952
PR Review Comment: https://git.openjdk.org/jdk/pull/30941#discussion_r3288518690

Re: RFR: 8355216: Accelerate P-256 arithmetic on aarch64 [v5]

Reply via email to