On Mon, 18 May 2026 09:10:20 GMT, Andrew Dinn <[email protected]> wrote:
>> Ferenc Rakoczi has updated the pull request incrementally with one >> additional commit since the last revision: >> >> Accepting more suggestions from Andrew Dinn. > > src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp line 7863: > >> 7861: __ sub(sp, sp, 128); >> 7862: __ mov(mul_ptr, sp); >> 7863: > > Suggestion: > > // neon and gpr computations are interleaved to maximize parallelism Accepted. > src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp line 7864: > >> 7862: __ mov(mul_ptr, sp); >> 7863: >> 7864: neon_partial_mult_64(A, b_lows, a_vals, 0); > > Suggestion: > > // cross-multiply low * low for limbs b0-b3 and a3-a4 in parallel > neon_partial_mult_64(A, b_lows, a_vals, 0); Accepted. > src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp line 7871: > >> 7869: __ andr(n, low, limb_mask); >> 7870: >> 7871: neon_partial_mult_64(B, b_highs, a_vals, 0); > > Suggestion: > > // cross-multiply high * low for limbs b0-b3 and a3-a4 in parallel > neon_partial_mult_64(B, b_highs, a_vals, 0); Accepted. > src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp line 7880: > >> 7878: __ add(c_i, c_i, high); >> 7879: >> 7880: neon_partial_mult_64(C, b_lows, a_vals, 1); > > Suggestion: > > // cross-multiply low * high for limbs b0-b3 and a3-a4 in parallel > neon_partial_mult_64(C, b_lows, a_vals, 1); Accepted. > src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp line 7885: > >> 7883: gpr_partial_mult_52(a_i, b_1, high, low, tmp, limb_mask); >> 7884: >> 7885: neon_partial_mult_64(D, b_highs, a_vals, 1); > > Suggestion: > > // cross-multiply high * high for limbs b0-b3 and a3-a4 in parallel > neon_partial_mult_64(D, b_highs, a_vals, 1); Accepted. > src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp line 7894: > >> 7892: __ mov(c_i, high); >> 7893: >> 7894: vs_addv(B, __ T2D, B, C); // Store (B+C) in B > > Suggestion: > > // combine neon 32-bit partial products, regrouping to produce > // 8*52-bit low products in A and 8*52-bit high products in D > > // add low*high/high*low intermediate products before regrouping > vs_addv(B, __ T2D, B, C); // Store (B+C) in B Accepted. > src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp line 7902: > >> 7900: __ mov(c_i, high); >> 7901: >> 7902: vs_shl(D, __ T2D, D, montMulP256Shift1); > > Suggestion: > > // shift high*high (40-bit) product up into 52-bits of output > vs_shl(D, __ T2D, D, montMulP256Shift1); Accepted. > src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp line 7907: > >> 7905: gpr_partial_mult_52(a_i, b_3, high, low, tmp, limb_mask); >> 7906: >> 7907: vs_ushr(C, __ T2D, B, 32 - montMulP256Shift1); // Use C for ((B+C) >> >>> 20) > > Suggestion: > > // shift high 32 (or 33) bits of intermediate products for addition to D > vs_ushr(C, __ T2D, B, 32 - montMulP256Shift1); // Use C for ((B+C) >>> 20) Accepted. > src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp line 7916: > >> 7914: __ mov(c_i, high); >> 7915: >> 7916: vs_shl(B, __ T2D, B, 32); > > Suggestion: > > // shift low 32 bits of intermediate product up for masking and addition > to A > vs_shl(B, __ T2D, B, 32); Accepted. > src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp line 7921: > >> 7919: gpr_partial_mult_52(a_i, b_4, high, low, tmp, limb_mask); >> 7920: >> 7921: vs_addv(D, __ T2D, D, C); > > Suggestion: > > // add high bits of intermediate product into D > vs_addv(D, __ T2D, D, C); Accepted. > src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp line 7930: > >> 7928: __ str(high, Address(c_ptr, 32)); >> 7929: >> 7930: vs_ushr(C, __ T2D, A, 52); // C now holds (A >>> 52) > > Suggestion: > > // top 12 bits of 32*32 bit product in A need adding into high 56-bit > output > vs_ushr(C, __ T2D, A, 52); // C now holds (A >>> 52) Accepted. > src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp line 7931: > >> 7929: >> 7930: vs_ushr(C, __ T2D, A, 52); // C now holds (A >>> 52) >> 7931: vs_andr(B, B, limb_mask_vec); > > Suggestion: > > // Only 20 of the 32 bits now in the top of B should be added into A > vs_andr(B, B, limb_mask_vec); Accepted. > src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp line 7932: > >> 7930: vs_ushr(C, __ T2D, A, 52); // C now holds (A >>> 52) >> 7931: vs_andr(B, B, limb_mask_vec); >> 7932: vs_andr(A, A, limb_mask_vec); > > Suggestion: > > // reduce original 64-bit product to 52-bits > vs_andr(A, A, limb_mask_vec); Accepted. > src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp line 7933: > >> 7931: vs_andr(B, B, limb_mask_vec); >> 7932: vs_andr(A, A, limb_mask_vec); >> 7933: vs_addv(D, __ T2D, D, C); > > Suggestion: > > // add intermediate products to high 52-bit result in D > vs_addv(D, __ T2D, D, C); Accepted. > src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp line 7934: > >> 7932: vs_andr(A, A, limb_mask_vec); >> 7933: vs_addv(D, __ T2D, D, C); >> 7934: vs_addv(A, __ T2D, A, B); > > Suggestion: > > // add 20/21 bits of intermediate product in top of B into low 52-bit > result > vs_addv(A, __ T2D, A, B); Accepted. > src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp line 7936: > >> 7934: vs_addv(A, __ T2D, A, B); >> 7935: vs_ushr(B, __ T2D, A, montMulP256Shift2); >> 7936: vs_andr(A, A, limb_mask_vec); > > Suggestion: > > // save and then mask off any overflow bit from computing low 52-bit > result > vs_ushr(B, __ T2D, A, montMulP256Shift2); > vs_andr(A, A, limb_mask_vec); Accepted. > src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp line 7937: > >> 7935: vs_ushr(B, __ T2D, A, montMulP256Shift2); >> 7936: vs_andr(A, A, limb_mask_vec); >> 7937: vs_addv(D, __ T2D, D, B); > > Suggestion: > > // add any remaining carry into the high 52-bit result > vs_addv(D, __ T2D, D, B); Accepted. ------------- PR Review Comment: https://git.openjdk.org/jdk/pull/30941#discussion_r3288502224 PR Review Comment: https://git.openjdk.org/jdk/pull/30941#discussion_r3288502832 PR Review Comment: https://git.openjdk.org/jdk/pull/30941#discussion_r3288503636 PR Review Comment: https://git.openjdk.org/jdk/pull/30941#discussion_r3288504371 PR Review Comment: https://git.openjdk.org/jdk/pull/30941#discussion_r3288505283 PR Review Comment: https://git.openjdk.org/jdk/pull/30941#discussion_r3288506003 PR Review Comment: https://git.openjdk.org/jdk/pull/30941#discussion_r3288506839 PR Review Comment: https://git.openjdk.org/jdk/pull/30941#discussion_r3288507293 PR Review Comment: https://git.openjdk.org/jdk/pull/30941#discussion_r3288508193 PR Review Comment: https://git.openjdk.org/jdk/pull/30941#discussion_r3288509570 PR Review Comment: https://git.openjdk.org/jdk/pull/30941#discussion_r3288510285 PR Review Comment: https://git.openjdk.org/jdk/pull/30941#discussion_r3288516819 PR Review Comment: https://git.openjdk.org/jdk/pull/30941#discussion_r3288517510 PR Review Comment: https://git.openjdk.org/jdk/pull/30941#discussion_r3288518070 PR Review Comment: https://git.openjdk.org/jdk/pull/30941#discussion_r3288520182 PR Review Comment: https://git.openjdk.org/jdk/pull/30941#discussion_r3288520952 PR Review Comment: https://git.openjdk.org/jdk/pull/30941#discussion_r3288518690
