https://gcc.gnu.org/g:a48c0437d5c6b90ec6be5203a8aaad48a88d83a3
commit a48c0437d5c6b90ec6be5203a8aaad48a88d83a3 Author: Michael Meissner <meiss...@linux.ibm.com> Date: Thu Mar 27 21:20:13 2025 -0400 Update ChangeLog.* Diff: --- gcc/ChangeLog.vpair | 420 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 420 insertions(+) diff --git a/gcc/ChangeLog.vpair b/gcc/ChangeLog.vpair index da61dc2ec8af..c01d76e81005 100644 --- a/gcc/ChangeLog.vpair +++ b/gcc/ChangeLog.vpair @@ -1,5 +1,425 @@ +==================== Branch work198-vpair, patch #500 ==================== + +Vector pair support. + +This patch adds a new include file (vector-pair.h) that adds support so that +users writing high performance libraries can change their code to allow the +generation of the vector pair load and store instructions on power10. + +The intention is that if the library authors need to write special loops that +go over arrays that they could modify their code to use the functions provided +to change loops that can take advantage of the higher bandwidth for load vector +pair and store instructions. + +This particular patch just adds a new include file (vector-pair.h) that +provides a bunch of functions that on a power10 system would use the vector +pair load operation, 2 floating point operations, and a vector pair store. It +does not add any new types, modes, or built-in function. + +I have additional patches that can add built-in functions that the functions in +vector-pair.h could utilize so that the compiler can optimize and combine +operations. I may submit those patches in the future, but I would like to +provide this patch to allow the library writer to optimize their code. + +I've measured the performance of these new functions on a power10. 
For default +unrolling, the percentage of change for the 3 methods over the normal vector +loop method: + + 116% Vector-pair.h function, default unroll + 93% Vector pair split built-in & 2 vector stores, default unroll + 86% Vector pair split & combine built-ins, default unroll + +Using explicit 2 way unrolling the numbers are: + + 114% Vector-pair.h function, unroll 2 + 106% Vector pair split built-in & 2 vector stores, unroll 2 + 98% Vector pair split & combine built-ins, unroll 2 + +These new functions provided in vector-pair.h use the vector pair load/store +instructions, and don't generate extra vector moves. Using the existing +vector pair disassemble and assemble built-ins generate extra vector moves +which can hinder performance. + +If I compile the loop code for power9, there is a minor speed up for default +unrolling and more of an improvement using the framework provided in the +vector-pair.h for explicit unrolling by 2: + + 101% Vector-pair.h function, default unroll for power9 + 107% Vector-pair.h function, unroll 2 for power9 + +Of course this is a synthetic benchmark run on a quiet power10 system. Results +would vary for real code on real systems. However, I feel adding these +functions can allow the writers of high performance libraries to better +optimize their code. 
+ +As an example, if the library wants to code a simple fused multiply-add loop, +they might write the code as follows: + + #include <altivec.h> + #include <math.h> + #include <stddef.h> + + void + fma_vector (double * __restrict__ r, + const double * __restrict__ a, + const double * __restrict__ b, + size_t n) + { + vector double * __restrict__ vr = (vector double * __restrict__)r; + const vector double * __restrict__ va = (const vector double * __restrict__)a; + const vector double * __restrict__ vb = (const vector double * __restrict__)b; + size_t num_elements = sizeof (vector double) / sizeof (double); + size_t nv = n / num_elements; + size_t i; + + for (i = 0; i < nv; i++) + vr[i] = __builtin_vsx_xvmadddp (va[i], vb[i], vr[i]); + + for (i = nv * num_elements; i < n; i++) + r[i] = fma (a[i], b[i], r[i]); + } + +The inner loop would look like: + + .L3: + lxvx 0,3,9 + lxvx 12,4,9 + addi 10,9,16 + addi 2,2,-2 + lxvx 11,5,9 + xvmaddadp 0,12,11 + lxvx 12,4,10 + lxvx 11,5,10 + stxvx 0,3,9 + lxvx 0,3,10 + addi 9,9,32 + xvmaddadp 0,12,11 + stxvx 0,3,10 + bdnz .L3 + +Now if you code the loop to use __builtin_vsx_disassemble_pair to do a vector +pair load, but then do 2 vector stores: + + + #include <altivec.h> + #include <math.h> + #include <stddef.h> + + void + fma_mma_ld (double * __restrict__ r, + const double * __restrict__ a, + const double * __restrict__ b, + size_t n) + { + __vector_pair * __restrict__ vp_r = (__vector_pair * __restrict__)r; + const __vector_pair * __restrict__ vp_a = (const __vector_pair * __restrict__)a; + const __vector_pair * __restrict__ vp_b = (const __vector_pair * __restrict__)b; + vector double * __restrict__ v_r = (vector double * __restrict__)r; + size_t num_elements = (sizeof (__vector_pair) / sizeof (double)); + size_t n_vp = n / num_elements; + size_t i, j; + vector double a_hi_lo[2]; + vector double b_hi_lo[2]; + vector double r_hi_lo[2]; + vector double result_hi, result_lo; + + j = 0; + for (i = 0; i < n_vp; i++) + { + 
__builtin_vsx_disassemble_pair (&a_hi_lo[0], &vp_a[i]); + __builtin_vsx_disassemble_pair (&b_hi_lo[0], &vp_b[i]); + __builtin_vsx_disassemble_pair (&r_hi_lo[0], &vp_r[i]); + + result_hi = __builtin_vsx_xvmadddp (a_hi_lo[0], b_hi_lo[0], r_hi_lo[0]); + result_lo = __builtin_vsx_xvmadddp (a_hi_lo[1], b_hi_lo[1], r_hi_lo[1]); + + v_r[ j+0 ] = result_hi; + v_r[ j+1 ] = result_lo; + j += 2; + } + + for (i = n_vp * num_elements; i < n; i++) + r[i] = fma (a[i], b[i], r[i]); + } + +And the inner loop would look like: + + .L72: + lxvpx 10,4,2 + lxvpx 0,5,2 + lxvpx 12,3,2 + xxlor 8,11,11 + xxlor 11,1,1 + xvmaddmdp 0,10,12 + xvmaddmdp 11,8,13 + stxvx 11,3,2 + stxvx 0,9,2 + addi 2,2,32 + bdnz .L72 + +I.e. it does 3 vector pair loads, but it adds 2 extra vector moves in the loop. +Also, normal unrolling does not unroll this loop. But you can use #pragma GCC +unroll 2 to explicitly unroll the loop, and it would generate: + + .L97: + lxvpx 6,3,2 + addi 9,2,32 + lxvpx 12,4,2 + lxvpx 4,5,2 + lxvpx 8,5,9 + lxvpx 10,3,9 + lxvpx 0,4,9 + xxlor 32,13,13 + xxlor 13,7,7 + xvmaddmdp 12,4,6 + xxlor 7,9,9 + xxlor 9,13,13 + xvmaddmdp 0,8,10 + xvmaddadp 9,5,32 + xvmaddadp 11,7,1 + stxvx 9,3,2 + stxvx 12,10,2 + addi 2,2,64 + stxvx 11,3,9 + stxvx 0,10,9 + bdnz .L97 + +I.e. 
it now adds 4 extra vector moves instead of 2. + +If you try to do vector pair loads, split the vector pairs into separate +vectors, do the fma, and then combine the two vector results back into a +vector pair, the code might look like: + + #include <altivec.h> + #include <math.h> + #include <stddef.h> + + void + fma_mma_ld_st (double * __restrict__ r, + const double * __restrict__ a, + const double * __restrict__ b, + size_t n) + { + __vector_pair * __restrict__ vp_r = (__vector_pair * __restrict__)r; + const __vector_pair * __restrict__ vp_a = (const __vector_pair * __restrict__)a; + const __vector_pair * __restrict__ vp_b = (const __vector_pair * __restrict__)b; + size_t num_elements = (sizeof (__vector_pair) / sizeof (double)); + size_t n_vp = n / num_elements; + size_t i; + union vec_alias { + vector double vd; + vector unsigned char vuc; + }; + vector double a_hi_lo[2]; + vector double b_hi_lo[2]; + vector double r_hi_lo[2]; + union vec_alias result_hi, result_lo; + + for (i = 0; i < n_vp; i++) + { + __builtin_vsx_disassemble_pair (&a_hi_lo[0], &vp_a[i]); + __builtin_vsx_disassemble_pair (&b_hi_lo[0], &vp_b[i]); + __builtin_vsx_disassemble_pair (&r_hi_lo[0], &vp_r[i]); + + result_hi.vd = __builtin_vsx_xvmadddp (a_hi_lo[0], b_hi_lo[0], r_hi_lo[0]); + result_lo.vd = __builtin_vsx_xvmadddp (a_hi_lo[1], b_hi_lo[1], r_hi_lo[1]); + + __builtin_vsx_build_pair (&vp_r[i], result_hi.vuc, result_lo.vuc); + } + + for (i = n_vp * num_elements; i < n; i++) + r[i] = fma (a[i], b[i], r[i]); + } + +The inner loop would look like: + + .L128: + lxvpx 10,4,2 + lxvpx 0,5,2 + lxvpx 12,3,2 + xxlor 9,10,10 + xxlor 10,11,11 + xxlor 11,1,1 + xvmaddmdp 0,9,12 + xvmaddmdp 11,10,13 + xxlor 12,0,0 + xxlor 13,11,11 + stxvpx 12,3,2 + addi 2,2,32 + bdnz .L128 + +I.e. there are now 3 extra vector moves after the load vector pair instruction, +and 2 vector moves to combine the vector back into a vector pair. 
+ +If you use an explicit #pragma GCC unroll 2, the code generated would be: + + .L153: + lxvpx 10,3,2 + addi 9,2,32 + lxvpx 6,4,2 + lxvpx 8,5,2 + lxvpx 12,5,9 + lxvpx 0,4,9 + xxlor 3,11,11 + xxlor 5,6,6 + xxlor 6,7,7 + xxlor 7,9,9 + xxlor 11,12,12 + xxlor 12,3,3 + xvmaddadp 10,5,8 + xxlor 9,13,13 + xvmaddadp 12,7,6 + xxlor 6,10,10 + xxlor 7,12,12 + stxvpx 6,3,2 + addi 2,2,64 + lxvpx 12,3,9 + xxlor 10,12,12 + xxlor 12,13,13 + xvmaddmdp 0,11,10 + xvmaddadp 12,9,1 + xxlor 10,0,0 + xxlor 11,12,12 + stxvpx 10,3,9 + bdnz .L153 + +Finally, if you recode the loop to use the vpair_f64_fma function in this +patch, the code would look like: + + #include <altivec.h> + #include <math.h> + #include <vector-pair.h> + #include <stddef.h> + + void + fma_vpair (double * __restrict__ r, + const double * __restrict__ a, + const double * __restrict__ b, + size_t n) + { + vector_pair_f64_t * __restrict__ vp_r = (vector_pair_f64_t * __restrict__)r; + const vector_pair_f64_t * __restrict__ vp_a = (const vector_pair_f64_t * __restrict__)a; + const vector_pair_f64_t * __restrict__ vp_b = (const vector_pair_f64_t * __restrict__)b; + size_t num_elements = (sizeof (vector_pair_f64_t) / sizeof (double)); + size_t n_vp = n / num_elements; + size_t i; + + for (i = 0; i < n_vp; i++) + vpair_f64_fma (&vp_r[i], &vp_a[i], &vp_b[i], &vp_r[i]); + + for (i = n_vp * num_elements; i < n; i++) + r[i] = fma (a[i], b[i], r[i]); + } + +The inner loop would generate: + + .L184: + addi 9,2,32 + lxvpx 0,3,2 + lxvpx 8,4,2 + lxvpx 6,5,2 + lxvpx 12,4,9 + lxvpx 10,5,9 + #APP + # 437 "./include/vector-pair.h" 1 + xvmaddadp 0,8,6 + xvmaddadp 0+1,8+1,6+1 + # 0 "" 2 + #NO_APP + stxvpx 0,3,2 + addi 2,2,64 + lxvpx 0,3,9 + #APP + # 437 "./include/vector-pair.h" 1 + xvmaddadp 0,12,10 + xvmaddadp 0+1,12+1,10+1 + # 0 "" 2 + #NO_APP + stxvpx 0,3,9 + bdnz .L184 + +I.e. there are no extra vector moves in this loop, and normal unrolling does +duplicate this loop. 
+ +The vector-pair.h include file provides support if the code is compiled on +previous VSX systems that don't have the vector pair load/store instructions. +This allows the library writer to use the same code on both power9 and power10 +systems, without having to use #ifdef operations. On a power9, the code +generated would be: + + .L66: + lxvx 0,3,9 + lxvx 12,4,9 + lxvx 11,5,9 + xvmaddadp 0,12,11 + lxvx 12,7,9 + lxvx 11,8,9 + stxvx 0,3,9 + lxvx 0,10,9 + xvmaddadp 0,12,11 + stxvx 0,10,9 + addi 9,9,32 + bdnz .L66 + +With an explicit #pragma GCC unroll 2, the code generated would be: + + .L93: + lxvx 0,3,9 + lxvx 12,4,9 + addi 10,9,32 + lxvx 11,5,9 + xvmaddadp 0,12,11 + lxvx 12,7,9 + lxvx 11,11,9 + stxvx 0,3,9 + lxvx 0,8,9 + xvmaddadp 0,12,11 + lxvx 12,4,10 + lxvx 11,5,10 + stxvx 0,8,9 + addi 9,9,64 + lxvx 0,3,10 + xvmaddadp 0,12,11 + lxvx 12,7,10 + lxvx 11,11,10 + stxvx 0,3,10 + lxvx 0,8,10 + xvmaddadp 0,12,11 + stxvx 0,8,10 + bdnz .L93 + + +2025-03-27 Michael Meissner <meiss...@linux.ibm.com> + +gcc/ + + * config.gcc (powerpc*-*-*): Add vector-pair.h to extra headers. + * config/rs6000/vector-pair.h: New file. + * doc/extend.texi (PowerPC Vector Pair Support): Document the vector + pair support functions. + +gcc/testsuite/ + + * gcc.target/powerpc/vpair-1.c: New test or include file. + * gcc.target/powerpc/vpair-2.c: Likewise. + * gcc.target/powerpc/vpair-3-not-p10.c: Likewise. + * gcc.target/powerpc/vpair-3-p10.c: Likewise. + * gcc.target/powerpc/vpair-3.h: Likewise. + * gcc.target/powerpc/vpair-4-not-p10.c: Likewise. + * gcc.target/powerpc/vpair-4-p10.c: Likewise. + * gcc.target/powerpc/vpair-4.h: Likewise. + ==================== Branch work198-vpair, baseline ==================== +Add ChangeLog.vpair and update REVISION. + +2025-03-27 Michael Meissner <meiss...@linux.ibm.com> + +gcc/ + + * ChangeLog.vpair: New file for branch. + * REVISION: Update. + 2025-03-27 Michael Meissner <meiss...@linux.ibm.com> Clone branch