This is the 2nd version of the patch to add a new include file (vector-pair.h) that adds support so that users writing high performance libraries can change their code to allow the generation of the vector pair load and store instructions on power10.
The previous version of the patch was posted at: https://gcc.gnu.org/pipermail/gcc-patches/2024-November/669139.html The only difference is this patch assumes that GCC compilers on non-Linux systems and non-GCC compilers can use arithmetic on the register number for the __asm__ statements in power10/power11 so the __asm__ can access the 2nd register in a vector pair. I.e. to do a vector pair of vector double addition you can write: __asm__ ("xvadddp %x0,%x1,%x2\n\txvadddp %x0+1,%x1+1,%x2+1" : "=wa" (a) : "wa" (b), "wa" (c)); Otherwise, for these non-Linux and non-GCC uses, we would have to write the following. This means the vector pairs can only be allocated to the traditional floating point registers overlaid on the vector registers: __asm__ ("xvadddp %0,%1,%2\n\txvadddp %L0,%L1,%L2" : "=d" (a) : "d" (b), "d" (c)); The intention is that if library authors need to write special loops that go over arrays, they can modify their code to use the functions provided, so that those loops can take advantage of the higher bandwidth of the load and store vector pair instructions. This particular patch just adds a new include file (vector-pair.h) that provides a bunch of functions that on a power10 system would use the vector pair load operation, 2 floating point operations, and a vector pair store. It does not add any new types, modes, or built-in functions. I have additional patches that can add built-in functions that the functions in vector-pair.h could utilize so that the compiler can optimize and combine operations. I may submit those patches in the future, but I would like to provide this patch to allow the library writer to optimize their code. I've measured the performance of these new functions on a power10. 
For default unrolling, the percentage of change for the 3 methods over the normal vector loop method: 116% Vector-pair.h function, default unroll 93% Vector pair split built-in & 2 vector stores, default unroll 86% Vector pair split & combine built-ins, default unroll Using explicit 2 way unrolling the numbers are: 114% Vector-pair.h function, unroll 2 106% Vector pair split built-in & 2 vector stores, unroll 2 98% Vector pair split & combine built-ins, unroll 2 These new functions provided in vector-pair.h use the vector pair load/store instructions, and don't generate extra vector moves. Using the existing vector pair disassemble and assemble built-ins generate extra vector moves which can hinder performance. If I compile the loop code for power9, there is a minor speed up for default unrolling and more of an improvement using the framework provided in the vector-pair.h for explicit unrolling by 2: 101% Vector-pair.h function, default unroll for power9 107% Vector-pair.h function, unroll 2 for power9 Of course this is a synthetic benchmark run on a quiet power10 system. Results would vary for real code on real systems. However, I feel adding these functions can allow the writers of high performance libraries to better optimize their code. 
As an example, if the library wants to code a simple fused multiply-add loop, they might write the code as follows: #include <altivec.h> #include <math.h> #include <stddef.h> void fma_vector (double * __restrict__ r, const double * __restrict__ a, const double * __restrict__ b, size_t n) { vector double * __restrict__ vr = (vector double * __restrict__)r; const vector double * __restrict__ va = (const vector double * __restrict__)a; const vector double * __restrict__ vb = (const vector double * __restrict__)b; size_t num_elements = sizeof (vector double) / sizeof (double); size_t nv = n / num_elements; size_t i; for (i = 0; i < nv; i++) vr[i] = __builtin_vsx_xvmadddp (va[i], vb[i], vr[i]); for (i = nv * num_elements; i < n; i++) r[i] = fma (a[i], b[i], r[i]); } The inner loop would look like: .L3: lxvx 0,3,9 lxvx 12,4,9 addi 10,9,16 addi 2,2,-2 lxvx 11,5,9 xvmaddadp 0,12,11 lxvx 12,4,10 lxvx 11,5,10 stxvx 0,3,9 lxvx 0,3,10 addi 9,9,32 xvmaddadp 0,12,11 stxvx 0,3,10 bdnz .L3 Now if you code the loop to use __builtin_vsx_disassemble_pair to do a vector pair load, but then do 2 vector stores: #include <altivec.h> #include <math.h> #include <stddef.h> void fma_mma_ld (double * __restrict__ r, const double * __restrict__ a, const double * __restrict__ b, size_t n) { __vector_pair * __restrict__ vp_r = (__vector_pair * __restrict__)r; const __vector_pair * __restrict__ vp_a = (const __vector_pair * __restrict__)a; const __vector_pair * __restrict__ vp_b = (const __vector_pair * __restrict__)b; vector double * __restrict__ v_r = (vector double * __restrict__)r; size_t num_elements = (sizeof (__vector_pair) / sizeof (double)); size_t n_vp = n / num_elements; size_t i, j; vector double a_hi_lo[2]; vector double b_hi_lo[2]; vector double r_hi_lo[2]; vector double result_hi, result_lo; j = 0; for (i = 0; i < n_vp; i++) { __builtin_vsx_disassemble_pair (&a_hi_lo[0], &vp_a[i]); __builtin_vsx_disassemble_pair (&b_hi_lo[0], &vp_b[i]); __builtin_vsx_disassemble_pair (&r_hi_lo[0], 
&vp_r[i]); result_hi = __builtin_vsx_xvmadddp (a_hi_lo[0], b_hi_lo[0], r_hi_lo[0]); result_lo = __builtin_vsx_xvmadddp (a_hi_lo[1], b_hi_lo[1], r_hi_lo[1]); v_r[ j+0 ] = result_hi; v_r[ j+1 ] = result_lo; j += 2; } for (i = n_vp * num_elements; i < n; i++) r[i] = fma (a[i], b[i], r[i]); } And the inner loop would look like: .L72: lxvpx 10,4,2 lxvpx 0,5,2 lxvpx 12,3,2 xxlor 8,11,11 xxlor 11,1,1 xvmaddmdp 0,10,12 xvmaddmdp 11,8,13 stxvx 11,3,2 stxvx 0,9,2 addi 2,2,32 bdnz .L72 I.e. it does 3 vector pair loads, but it adds 2 extra vector moves in the loop. Also, normal unrolling does not unroll this loop. But you can use #pragma GCC unroll 2 to explicitly unroll the loop, and it would generate: .L97: lxvpx 6,3,2 addi 9,2,32 lxvpx 12,4,2 lxvpx 4,5,2 lxvpx 8,5,9 lxvpx 10,3,9 lxvpx 0,4,9 xxlor 32,13,13 xxlor 13,7,7 xvmaddmdp 12,4,6 xxlor 7,9,9 xxlor 9,13,13 xvmaddmdp 0,8,10 xvmaddadp 9,5,32 xvmaddadp 11,7,1 stxvx 9,3,2 stxvx 12,10,2 addi 2,2,64 stxvx 11,3,9 stxvx 0,10,9 bdnz .L97 I.e. it now adds 4 extra vector moves instead of 2. If you try to do vector pair loads, split the vector pairs into separate vectors, do the fma, and then combine the two vector results back into a vector pair, the code might look like: #include <altivec.h> #include <math.h> #include <stddef.h> void fma_mma_ld_st (double * __restrict__ r, const double * __restrict__ a, const double * __restrict__ b, size_t n) { __vector_pair * __restrict__ vp_r = (__vector_pair * __restrict__)r; const __vector_pair * __restrict__ vp_a = (const __vector_pair * __restrict__)a; const __vector_pair * __restrict__ vp_b = (const __vector_pair * __restrict__)b; size_t num_elements = (sizeof (__vector_pair) / sizeof (double)); size_t n_vp = n / num_elements; size_t i; union vec_alias { vector double vd; vector unsigned char vuc; }; vector double a_hi_lo[2]; vector double b_hi_lo[2]; vector double r_hi_lo[2]; union vec_alias result_hi, result_lo; for (i = 0; i < n_vp; i++) { __builtin_vsx_disassemble_pair (&a_hi_lo[0], 
&vp_a[i]); __builtin_vsx_disassemble_pair (&b_hi_lo[0], &vp_b[i]); __builtin_vsx_disassemble_pair (&r_hi_lo[0], &vp_r[i]); result_hi.vd = __builtin_vsx_xvmadddp (a_hi_lo[0], b_hi_lo[0], r_hi_lo[0]); result_lo.vd = __builtin_vsx_xvmadddp (a_hi_lo[1], b_hi_lo[1], r_hi_lo[1]); __builtin_vsx_build_pair (&vp_r[i], result_hi.vuc, result_lo.vuc); } for (i = n_vp * num_elements; i < n; i++) r[i] = fma (a[i], b[i], r[i]); } The inner loop would look like: .L128: lxvpx 10,4,2 lxvpx 0,5,2 lxvpx 12,3,2 xxlor 9,10,10 xxlor 10,11,11 xxlor 11,1,1 xvmaddmdp 0,9,12 xvmaddmdp 11,10,13 xxlor 12,0,0 xxlor 13,11,11 stxvpx 12,3,2 addi 2,2,32 bdnz .L128 I.e. there are now 3 extra vector moves after the load vector pair instruction, and 2 vector moves to combine the vector back into a vector pair. If you use an explicit #pragma GCC unroll 2, the code generated would be: .L153: lxvpx 10,3,2 addi 9,2,32 lxvpx 6,4,2 lxvpx 8,5,2 lxvpx 12,5,9 lxvpx 0,4,9 xxlor 3,11,11 xxlor 5,6,6 xxlor 6,7,7 xxlor 7,9,9 xxlor 11,12,12 xxlor 12,3,3 xvmaddadp 10,5,8 xxlor 9,13,13 xvmaddadp 12,7,6 xxlor 6,10,10 xxlor 7,12,12 stxvpx 6,3,2 addi 2,2,64 lxvpx 12,3,9 xxlor 10,12,12 xxlor 12,13,13 xvmaddmdp 0,11,10 xvmaddadp 12,9,1 xxlor 10,0,0 xxlor 11,12,12 stxvpx 10,3,9 bdnz .L153 Finally, if you recode the loop to use the vpair_f64_fma function in this patch, the code would look like: #include <altivec.h> #include <math.h> #include <vector-pair.h> #include <stddef.h> void fma_vpair (double * __restrict__ r, const double * __restrict__ a, const double * __restrict__ b, size_t n) { vector_pair_f64_t * __restrict__ vp_r = (vector_pair_f64_t * __restrict__)r; const vector_pair_f64_t * __restrict__ vp_a = (const vector_pair_f64_t * __restrict__)a; const vector_pair_f64_t * __restrict__ vp_b = (const vector_pair_f64_t * __restrict__)b; size_t num_elements = (sizeof (vector_pair_f64_t) / sizeof (double)); size_t n_vp = n / num_elements; size_t i; for (i = 0; i < n_vp; i++) vpair_f64_fma (&vp_r[i], &vp_a[i], &vp_b[i], 
&vp_r[i]); for (i = n_vp * num_elements; i < n; i++) r[i] = fma (a[i], b[i], r[i]); } The inner loop would generate: .L184: addi 9,2,32 lxvpx 0,3,2 lxvpx 8,4,2 lxvpx 6,5,2 lxvpx 12,4,9 lxvpx 10,5,9 #APP # 437 "./include/vector-pair.h" 1 xvmaddadp 0,8,6 xvmaddadp 0+1,8+1,6+1 # 0 "" 2 #NO_APP stxvpx 0,3,2 addi 2,2,64 lxvpx 0,3,9 #APP # 437 "./include/vector-pair.h" 1 xvmaddadp 0,12,10 xvmaddadp 0+1,12+1,10+1 # 0 "" 2 #NO_APP stxvpx 0,3,9 bdnz .L184 I.e. there are no extra vector moves in this loop, and normal unrolling does duplicate this loop. The vector-pair.h include file provides support if the code is compiled on previous VSX systems that don't have the vector pair load/store instructions. This allows the library writer to use the same code on both power9 and power10 systems, without having to use #ifdef operations. On a power9, the code generated would be: .L66: lxvx 0,3,9 lxvx 12,4,9 lxvx 11,5,9 xvmaddadp 0,12,11 lxvx 12,7,9 lxvx 11,8,9 stxvx 0,3,9 lxvx 0,10,9 xvmaddadp 0,12,11 stxvx 0,10,9 addi 9,9,32 bdnz .L66 With an explicit #pragma GCC unroll 2, the code generated would be: .L93: lxvx 0,3,9 lxvx 12,4,9 addi 10,9,32 lxvx 11,5,9 xvmaddadp 0,12,11 lxvx 12,7,9 lxvx 11,11,9 stxvx 0,3,9 lxvx 0,8,9 xvmaddadp 0,12,11 lxvx 12,4,10 lxvx 11,5,10 stxvx 0,8,9 addi 9,9,64 lxvx 0,3,10 xvmaddadp 0,12,11 lxvx 12,7,10 lxvx 11,11,10 stxvx 0,3,10 lxvx 0,8,10 xvmaddadp 0,12,11 stxvx 0,8,10 bdnz .L93 2024-12-02 Michael Meissner <meiss...@linux.ibm.com> gcc/ * config.gcc (powerpc*-*-*): Add vector-pair.h to extra headers. * config/rs6000/vector-pair.h: New file. * doc/extend.texi (PowerPC Vector Pair Support): Document the vector pair support functions. gcc/testsuite/ * gcc.target/powerpc/vpair-1.c: New test or include file. * gcc.target/powerpc/vpair-2.c: Likewise. * gcc.target/powerpc/vpair-3-not-p10.c: Likewise. * gcc.target/powerpc/vpair-3-p10.c: Likewise. * gcc.target/powerpc/vpair-3.h: Likewise. * gcc.target/powerpc/vpair-4-not-p10.c: Likewise. 
* gcc.target/powerpc/vpair-4-p10.c: Likewise. * gcc.target/powerpc/vpair-4.h: Likewise. --- gcc/config.gcc | 2 +- gcc/config/rs6000/rs6000-c.cc | 8 +- gcc/config/rs6000/vector-pair.h | 571 ++++++++++++++++++ gcc/doc/extend.texi | 100 +++ gcc/testsuite/gcc.target/powerpc/vpair-1.c | 150 +++++ gcc/testsuite/gcc.target/powerpc/vpair-2.c | 150 +++++ .../gcc.target/powerpc/vpair-3-not-p10.c | 15 + .../gcc.target/powerpc/vpair-3-p10.c | 14 + gcc/testsuite/gcc.target/powerpc/vpair-3.h | 461 ++++++++++++++ .../gcc.target/powerpc/vpair-4-not-p10.c | 15 + .../gcc.target/powerpc/vpair-4-p10.c | 14 + gcc/testsuite/gcc.target/powerpc/vpair-4.h | 461 ++++++++++++++ 12 files changed, 1958 insertions(+), 3 deletions(-) create mode 100644 gcc/config/rs6000/vector-pair.h create mode 100644 gcc/testsuite/gcc.target/powerpc/vpair-1.c create mode 100644 gcc/testsuite/gcc.target/powerpc/vpair-2.c create mode 100644 gcc/testsuite/gcc.target/powerpc/vpair-3-not-p10.c create mode 100644 gcc/testsuite/gcc.target/powerpc/vpair-3-p10.c create mode 100644 gcc/testsuite/gcc.target/powerpc/vpair-3.h create mode 100644 gcc/testsuite/gcc.target/powerpc/vpair-4-not-p10.c create mode 100644 gcc/testsuite/gcc.target/powerpc/vpair-4-p10.c create mode 100644 gcc/testsuite/gcc.target/powerpc/vpair-4.h diff --git a/gcc/config.gcc b/gcc/config.gcc index 6c1cd665ab2..b811ae005ef 100644 --- a/gcc/config.gcc +++ b/gcc/config.gcc @@ -534,7 +534,7 @@ powerpc*-*-*) extra_headers="${extra_headers} pmmintrin.h tmmintrin.h smmintrin.h" extra_headers="${extra_headers} nmmintrin.h immintrin.h x86gprintrin.h" extra_headers="${extra_headers} ppu_intrinsics.h spu2vmx.h vec_types.h si2vmx.h" - extra_headers="${extra_headers} amo.h" + extra_headers="${extra_headers} amo.h vector-pair.h" case x$with_cpu in xpowerpc64|xdefault64|x6[23]0|x970|xG5|xpower[3456789]|xpower1[01]|xpower6x|xrs64a|xcell|xa2|xe500mc64|xe5500|xe6500|xfuture) cpu_is_64bit=yes diff --git a/gcc/config/rs6000/rs6000-c.cc b/gcc/config/rs6000/rs6000-c.cc 
index 6863e34fa70..c13b256ca51 100644 --- a/gcc/config/rs6000/rs6000-c.cc +++ b/gcc/config/rs6000/rs6000-c.cc @@ -590,9 +590,13 @@ rs6000_target_modify_macros (bool define_p, HOST_WIDE_INT flags, if (rs6000_cpu == PROCESSOR_CELL) rs6000_define_or_undefine_macro (define_p, "__PPU__"); - /* Tell the user if we support the MMA instructions. */ + /* Tell the user if we support the MMA instructions. Also tell vector-pair.h + that we have the vector pair built-in function support. */ if ((flags & OPTION_MASK_MMA) != 0) - rs6000_define_or_undefine_macro (define_p, "__MMA__"); + { + rs6000_define_or_undefine_macro (define_p, "__MMA__"); + rs6000_define_or_undefine_macro (define_p, "__VPAIR__"); + } /* Whether pc-relative code is being generated. */ if ((flags & OPTION_MASK_PCREL) != 0) rs6000_define_or_undefine_macro (define_p, "__PCREL__"); diff --git a/gcc/config/rs6000/vector-pair.h b/gcc/config/rs6000/vector-pair.h new file mode 100644 index 00000000000..575c88f0ecf --- /dev/null +++ b/gcc/config/rs6000/vector-pair.h @@ -0,0 +1,571 @@ +/* PowerPC vector pair include file. + Copyright (C) 2024 Free Software Foundation, Inc. + Contributed by Aldy Hernandez (al...@redhat.com). + Rewritten by Paolo Bonzini (bonz...@gnu.org). + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3, or (at your + option) any later version. + + GCC is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public + License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. 
+ + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + <http://www.gnu.org/licenses/>. */ + +/* Provide support for vector pairs, even on systems that do not have native + support for loading and storing pairs of vectors. */ + +#ifndef _VECTOR_PAIR_H +#define _VECTOR_PAIR_H 1 + +/* Union of the various vector pair types. */ +union __vpair_union { + +#ifdef __MMA__ + __vector_pair __vpair; +#endif + + vector double __vp_f64[2]; + vector float __vp_f32[2]; + vector unsigned char __vp_uc[2]; +}; + +typedef union __vpair_union vector_pair_f64_t; +typedef union __vpair_union vector_pair_f32_t; + +#if !__VPAIR_BUILTIN__ && !__VPAIR_ASM__ && !__VPAIR_NOP10__ +#if __MMA__ +#define __VPAIR_ASM__ 1 + +#else +#define __VPAIR_NOP10__ 1 +#endif +#endif + +/* Macros to simplify creation of the various operations. + * + * The __VPAIR_FP_{UNARY,BINARY,FMA} macros are the base macros, and takes: + * R: The argument for the output vector pair + * A, B, C: 1-3 arguments for the inputs + * OPCODE: The assembler opcode for __asm__ on power10 + * VEC: Either __vp_f64 or __vp_f32 for the union field + * VEC_FUNC: 128-bit vector function for use on power8/power9 + * + * The __VPAIR_FP_splat macro takes: + * R: The argument for the output vector pair + * X: The scalar that is to be splat-ed to the vector pair + * VEC: Either __vp_f64 or __vp_f32 for the union field + * + * The __VPAIR_F32_<...> and __VPAIR_F64_<...> macros call the above macros + * with the appropriate structure field to use. + */ + +#undef __VPAIR_FP_SPLAT +#undef __VPAIR_FP_UNARY +#undef __VPAIR_FP_BINARY +#undef __VPAIR_FP_FMA + +#undef __VPAIR_F64_UNARY +#undef __VPAIR_F64_BINARY +#undef __VPAIR_F64_FMA + +#undef __VPAIR_F32_UNARY +#undef __VPAIR_F32_BINARY +#undef __VPAIR_F32_FMA + +/* Operations using a vector pair and __asm__operations. 
*/ +#if __MMA__ && !__VPAIR_NOP10__ + +/* When using __asm__, we need to access the second register. Due to the way + VSX registers were formed by combining the traditional floating point + registers and Altivec registers, we can't use the output modifier %L<n> to + refer to the second register if the VSX register was a traditional Altivec + register. If the value is in VSX registers 34 & 35, %x0 would give 34, but + %L0 would give 1, since 'Altivec' registers start at 0. + + If we have a compiler where we cannot use register+1 to access the second + register in a vector pair, define __VPAIR__USE_FPR__ so that only traditional + floating point registers can be used, and %L<n> will return the 2nd register + name/number. */ + +#if __VPAIR__USE_FPR__ + +/* Use %0 and %L0 on traditional FPR registers. */ +#define __VPAIR_FP_SPLAT(R, X, VEC) \ + __asm__ ("xxlor %L0,%0,%0" \ + : "=d" ((R)->__vpair) \ + : "0" (__builtin_vec_splats ((X)))) + +#define __VPAIR_FP_UNARY(R, A, OPCODE, VEC, VEC_FUNC) \ + __asm__ (OPCODE " %0,%1\n\t" OPCODE " %L0,%L1" \ + : "=d" ((R)->__vpair) \ + : "d" ((A)->__vpair)) + +#define __VPAIR_FP_BINARY(R, A, B, OPCODE, VEC, VEC_FUNC) \ + __asm__ (OPCODE " %0,%1,%2\n\t" OPCODE " %L0,%L1,%L2" \ + : "=d" ((R)->__vpair) \ + : "d" ((A)->__vpair), "d" ((B)->__vpair)) + +/* Note the 'a' form of the fma instructions must be used. */ +#define __VPAIR_FP_FMA(R, A, B, C, OPCODE, VEC, VEC_FUNC) \ + __asm__ (OPCODE " %0,%1,%2\n\t" OPCODE " %L0,%L1,%L2" \ + : "=d" ((R)->__vpair) \ + : "d" ((A)->__vpair), "d" ((B)->__vpair), "0" ((C)->__vpair)) + +#else + +/* Use %x0 and %x0+1 on VSX registers. 
*/ +#define __VPAIR_FP_SPLAT(R, X, VEC) \ + __asm__ ("xxlor %x0+1,%x0,%x0" \ + : "=wa" ((R)->__vpair) \ + : "0" (__builtin_vec_splats ((X)))) + +#define __VPAIR_FP_UNARY(R, A, OPCODE, VEC, VEC_FUNC) \ + __asm__ (OPCODE " %x0,%x1\n\t" OPCODE " %x0+1,%x1+1" \ + : "=wa" ((R)->__vpair) \ + : "wa" ((A)->__vpair)) + +#define __VPAIR_FP_BINARY(R, A, B, OPCODE, VEC, VEC_FUNC) \ + __asm__ (OPCODE " %x0,%x1,%x2\n\t" OPCODE " %x0+1,%x1+1,%x2+1" \ + : "=wa" ((R)->__vpair) \ + : "wa" ((A)->__vpair), "wa" ((B)->__vpair)) + +/* Note the 'a' form of the fma instructions must be used. */ +#define __VPAIR_FP_FMA(R, A, B, C, OPCODE, VEC, VEC_FUNC) \ + __asm__ (OPCODE " %x0,%x1,%x2\n\t" OPCODE " %x0+1,%x1+1,%x2+1" \ + : "=wa" ((R)->__vpair) \ + : "wa" ((A)->__vpair), "wa" ((B)->__vpair), "0" ((C)->__vpair)) +#endif /* Select whether to use %0/%L0 or %x0/%x0+1. */ + +#else /* vpair support on power8/power9. */ + +/* Pair of vector operations using a built-in function. */ + +#define __VPAIR_FP_SPLAT(R, X, VEC) \ + (R)->VEC[0] = (R)->VEC[1] = __builtin_vec_splats ((X)) + +#define __VPAIR_FP_UNARY(R, A, OPCODE, VEC, VEC_FUNC) \ + do \ + { \ + (R)->VEC[0] = VEC_FUNC ((A)->VEC[0]); \ + (R)->VEC[1] = VEC_FUNC ((A)->VEC[1]); \ + } \ + while (0) + +#define __VPAIR_FP_BINARY(R, A, B, OPCODE, VEC, VEC_FUNC) \ + do \ + { \ + (R)->VEC[0] = VEC_FUNC ((A)->VEC[0], (B)->VEC[0]); \ + (R)->VEC[1] = VEC_FUNC ((A)->VEC[1], (B)->VEC[1]); \ + } \ + while (0) + +#define __VPAIR_FP_FMA(R, A, B, C, OPCODE, VEC, VEC_FUNC) \ + do \ + { \ + (R)->VEC[0] = VEC_FUNC ((A)->VEC[0], (B)->VEC[0], (C)->VEC[0]); \ + (R)->VEC[1] = VEC_FUNC ((A)->VEC[1], (B)->VEC[1], (C)->VEC[1]); \ + } \ + while (0) + +#endif /* Vector pair support. */ + +/* 64-bit version of the macros. 
*/ +#define __VPAIR_F64_UNARY(R, A, OPCODE, VEC_FUNC) \ + __VPAIR_FP_UNARY(R, A, OPCODE, __vp_f64, VEC_FUNC) + +#define __VPAIR_F64_BINARY(R, A, B, OPCODE, VEC_FUNC) \ + __VPAIR_FP_BINARY(R, A, B, OPCODE, __vp_f64, VEC_FUNC) + +#define __VPAIR_F64_FMA(R, A, B, C, OPCODE, VEC_FUNC) \ + __VPAIR_FP_FMA(R, A, B, C, OPCODE, __vp_f64, VEC_FUNC) + + +/* 32-bit version of the macros. */ +#define __VPAIR_F32_UNARY(R, A, OPCODE, VEC_FUNC) \ + __VPAIR_FP_UNARY(R, A, OPCODE, __vp_f32, VEC_FUNC) + +#define __VPAIR_F32_BINARY(R, A, B, OPCODE, VEC_FUNC) \ + __VPAIR_FP_BINARY(R, A, B, OPCODE, __vp_f32, VEC_FUNC) + +#define __VPAIR_F32_FMA(R, A, B, C, OPCODE, VEC_FUNC) \ + __VPAIR_FP_FMA(R, A, B, C, OPCODE, __vp_f32, VEC_FUNC) + + +/* Splat functions. */ + +/* 64-bit splat to vector pair. */ + +static inline void +vpair_f64_splat (vector_pair_f64_t *__r, double __x) +{ + __VPAIR_FP_SPLAT (__r, __x, __vp_f64); +} + +/* 32-bit splat to vector pair. */ + +static inline void +vpair_f32_splat (vector_pair_f32_t *__r, float __x) +{ + __VPAIR_FP_SPLAT (__r, __x, __vp_f32); +} + + +/* 64-bit unary functions. */ + +static inline void +vpair_f64_abs (vector_pair_f64_t *__r, + const vector_pair_f64_t *__a) +{ + __VPAIR_F64_UNARY (__r, __a, + "xvabsdp", + __builtin_vec_abs); +} + +static inline void +vpair_f64_nabs (vector_pair_f64_t *__r, + const vector_pair_f64_t *__a) +{ + __VPAIR_F64_UNARY (__r, __a, + "xvnabsdp", + __builtin_vec_nabs); +} + +static inline void +vpair_f64_neg (vector_pair_f64_t *__r, + const vector_pair_f64_t *__a) +{ + __VPAIR_F64_UNARY (__r, __a, + "xvnegdp", + __builtin_vec_neg); +} + +static inline void +vpair_f64_sqrt (vector_pair_f64_t *__r, + const vector_pair_f64_t *__a) +{ + __VPAIR_F64_UNARY (__r, __a, + "xvsqrtdp", + __builtin_vec_sqrt); +} + +/* 32-bit unary functions. 
*/ + +static inline void +vpair_f32_abs (vector_pair_f32_t *__r, + const vector_pair_f32_t *__a) +{ + __VPAIR_F32_UNARY (__r, __a, + "xvabssp", + __builtin_vec_abs); +} + +static inline void +vpair_f32_nabs (vector_pair_f32_t *__r, + const vector_pair_f32_t *__a) +{ + __VPAIR_F32_UNARY (__r, __a, + "xvnabssp", + __builtin_vec_nabs); +} + +static inline void +vpair_f32_neg (vector_pair_f32_t *__r, + const vector_pair_f32_t *__a) +{ + __VPAIR_F32_UNARY (__r, __a, + "xvnegsp", + __builtin_vec_neg); +} + +static inline void +vpair_f32_sqrt (vector_pair_f32_t *__r, + const vector_pair_f32_t *__a) +{ + __VPAIR_F32_UNARY (__r, __a, + "xvsqrtsp", + __builtin_vec_sqrt); +} + + +/* 64-bit binary functions. */ + +static inline void +vpair_f64_add (vector_pair_f64_t *__r, + const vector_pair_f64_t *__a, + const vector_pair_f64_t *__b) +{ + __VPAIR_F64_BINARY (__r, __a, __b, + "xvadddp", + __builtin_vec_add); +} + +static inline void +vpair_f64_div (vector_pair_f64_t *__r, + const vector_pair_f64_t *__a, + const vector_pair_f64_t *__b) +{ + __VPAIR_F64_BINARY (__r, __a, __b, + "xvdivdp", + __builtin_vec_div); +} + +static inline void +vpair_f64_max (vector_pair_f64_t *__r, + const vector_pair_f64_t *__a, + const vector_pair_f64_t *__b) +{ + __VPAIR_F64_BINARY (__r, __a, __b, + "xvmaxdp", + __builtin_vec_max); +} + +static inline void +vpair_f64_min (vector_pair_f64_t *__r, + const vector_pair_f64_t *__a, + const vector_pair_f64_t *__b) +{ + __VPAIR_F64_BINARY (__r, __a, __b, + "xvmindp", + __builtin_vec_min); +} + +static inline void +vpair_f64_mul (vector_pair_f64_t *__r, + const vector_pair_f64_t *__a, + const vector_pair_f64_t *__b) +{ + __VPAIR_F64_BINARY (__r, __a, __b, + "xvmuldp", + __builtin_vec_mul); +} + +static inline void +vpair_f64_sub (vector_pair_f64_t *__r, + const vector_pair_f64_t *__a, + const vector_pair_f64_t *__b) +{ + __VPAIR_F64_BINARY (__r, __a, __b, + "xvsubdp", + __builtin_vec_sub); +} + +/* 32-bit binary functions. 
*/ + +static inline void +vpair_f32_add (vector_pair_f32_t *__r, + const vector_pair_f32_t *__a, + const vector_pair_f32_t *__b) +{ + __VPAIR_F32_BINARY (__r, __a, __b, + "xvaddsp", + __builtin_vec_add); +} + +static inline void +vpair_f32_div (vector_pair_f32_t *__r, + const vector_pair_f32_t *__a, + const vector_pair_f32_t *__b) +{ + __VPAIR_F32_BINARY (__r, __a, __b, + "xvdivsp", + __builtin_vec_div); +} + +static inline void +vpair_f32_max (vector_pair_f32_t *__r, + const vector_pair_f32_t *__a, + const vector_pair_f32_t *__b) +{ + __VPAIR_F32_BINARY (__r, __a, __b, + "xvmaxsp", + __builtin_vec_max); +} + +static inline void +vpair_f32_min (vector_pair_f32_t *__r, + const vector_pair_f32_t *__a, + const vector_pair_f32_t *__b) +{ + __VPAIR_F32_BINARY (__r, __a, __b, + "xvminsp", + __builtin_vec_min); +} + +static inline void +vpair_f32_mul (vector_pair_f32_t *__r, + const vector_pair_f32_t *__a, + const vector_pair_f32_t *__b) +{ + __VPAIR_F32_BINARY (__r, __a, __b, + "xvmulsp", + __builtin_vec_mul); +} + +static inline void +vpair_f32_sub (vector_pair_f32_t *__r, + const vector_pair_f32_t *__a, + const vector_pair_f32_t *__b) +{ + __VPAIR_F32_BINARY (__r, __a, __b, + "xvsubsp", + __builtin_vec_sub); +} + +/* 64-bit fma operations. 
*/ + +static inline void +vpair_f64_fma (vector_pair_f64_t *__r, + const vector_pair_f64_t *__a, + const vector_pair_f64_t *__b, + const vector_pair_f64_t *__c) +{ + __VPAIR_F64_FMA (__r, __a, __b, __c, + "xvmaddadp", + __builtin_vsx_xvmadddp); +} + +static inline void +vpair_f64_fms (vector_pair_f64_t *__r, + const vector_pair_f64_t *__a, + const vector_pair_f64_t *__b, + const vector_pair_f64_t *__c) +{ + __VPAIR_F64_FMA (__r, __a, __b, __c, + "xvmsubadp", + __builtin_vsx_xvmsubdp); +} + +static inline void +vpair_f64_nfma (vector_pair_f64_t *__r, + const vector_pair_f64_t *__a, + const vector_pair_f64_t *__b, + const vector_pair_f64_t *__c) +{ + __VPAIR_F64_FMA (__r, __a, __b, __c, + "xvnmaddadp", + __builtin_vsx_xvnmadddp); +} + +static inline void +vpair_f64_nfms (vector_pair_f64_t *__r, + const vector_pair_f64_t *__a, + const vector_pair_f64_t *__b, + const vector_pair_f64_t *__c) +{ + __VPAIR_F64_FMA (__r, __a, __b, __c, + "xvnmsubadp", + __builtin_vsx_xvnmsubdp); +} +/* 32-bit fma operations. 
*/ + +static inline void +vpair_f32_fma (vector_pair_f32_t *__r, + const vector_pair_f32_t *__a, + const vector_pair_f32_t *__b, + const vector_pair_f32_t *__c) +{ + __VPAIR_F32_FMA (__r, __a, __b, __c, + "xvmaddasp", + __builtin_vsx_xvmaddsp); +} + +static inline void +vpair_f32_fms (vector_pair_f32_t *__r, + const vector_pair_f32_t *__a, + const vector_pair_f32_t *__b, + const vector_pair_f32_t *__c) +{ + __VPAIR_F32_FMA (__r, __a, __b, __c, + "xvmsubasp", + __builtin_vsx_xvmsubsp); +} + +static inline void +vpair_f32_nfma (vector_pair_f32_t *__r, + const vector_pair_f32_t *__a, + const vector_pair_f32_t *__b, + const vector_pair_f32_t *__c) +{ + __VPAIR_F32_FMA (__r, __a, __b, __c, + "xvnmaddasp", + __builtin_vsx_xvnmaddsp); +} + +static inline void +vpair_f32_nfms (vector_pair_f32_t *__r, + const vector_pair_f32_t *__a, + const vector_pair_f32_t *__b, + const vector_pair_f32_t *__c) +{ + __VPAIR_F32_FMA (__r, __a, __b, __c, + "xvnmsubasp", + __builtin_vsx_xvnmsubsp); +} + + +/* Swap even/odd operations. */ + +static inline void +vpair_f32_swap_odd_even (vector_pair_f32_t *__r, + const vector_pair_f32_t *__a) +{ + vector unsigned long long __rotate = { 32, 32 }; + +#if __MMA__ && !__VPAIR_NOP10__ + /* Power10 vector pair support. */ + __asm__ ("vrld %0,%1,%2\n\tvrld %L0,%L1,%2" + : "=v" (__r->__vpair) + : "v" (__a->__vpair), "v" (__rotate)); + +#else /* vector pair not available. */ + vector unsigned long long *__r_ll = (vector unsigned long long *)__r; + vector unsigned long long *__a_ll = (vector unsigned long long *)__a; + __r_ll[0] = __builtin_vec_vrld (__a_ll[0], __rotate); + __r_ll[1] = __builtin_vec_vrld (__a_ll[1], __rotate); +#endif /* Vector pair support. */ +} + + +static inline void +vpair_f64_swap_odd_even (vector_pair_f64_t *__r, + const vector_pair_f64_t *__a) +{ +#if __MMA__ && !__VPAIR_NOP10__ +#if __VPAIR__USE_FPR__ + + /* Use vector pair and use %0 and %L0 on traditional FPR registers. 
*/ + __asm__ ("xxpermdi %0,%1,%1,2\n\txxpermdi %L0,%L1,%L1,2" + : "=d" (__r->__vpair) + : "d" (__a->__vpair)); + +#else /* !__VPAIR__USE_FPR__. */ + /* Use vector pair and use %x0 and %x0+ on all VSX registers. */ + __asm__ ("xxpermdi %x0,%x1,%x1,2\n\txxpermdi %x0+1,%x1+1,%x1+1,2" + : "=wa" (__r->__vpair) + : "wa" (__a->__vpair)); + +#endif /* !__VPAIR__USE_FPR__. */ + +#else /* vector pair not available. */ + __r->__vp_f64[0] + = __builtin_vsx_xxpermdi_2df (__a->__vp_f64[0], __a->__vp_f64[0], 2); + __r->__vp_f64[1] + = __builtin_vsx_xxpermdi_2df (__a->__vp_f64[1], __a->__vp_f64[1], 2); + +#endif /* Vector pair support. */ +} + +#endif /* _VECTOR_PAIR_H. */ diff --git a/gcc/doc/extend.texi b/gcc/doc/extend.texi index c55991df8f1..f319a2dbc1a 100644 --- a/gcc/doc/extend.texi +++ b/gcc/doc/extend.texi @@ -16427,6 +16427,7 @@ instructions, but allow the compiler to schedule those calls. * PowerPC Hardware Transactional Memory Built-in Functions:: * PowerPC Atomic Memory Operation Functions:: * PowerPC Matrix-Multiply Assist Built-in Functions:: +* PowerPC Vector Pair Support:: * PRU Built-in Functions:: * RISC-V Built-in Functions:: * RISC-V Vector Intrinsics:: @@ -24830,6 +24831,105 @@ __vector_pair __builtin_vsx_lxvp (size_t, __vector_pair *); void __builtin_vsx_stxvp (__vector_pair, size_t, __vector_pair *); @end smallexample +@node PowerPC Vector Pair Support +@subsection PowerPC Vector Pair Support +ISA 3.1 (power10) added instructions to load and store pairs of +vectors with a single instruction. + +GCC now provides an include file (@file{vector-pair.h}) on PowerPC +systems that allows users to write code that can write 32-bit and +64-bit floating point code that processes data in 256-bit chunks +rather than 128-bit chunks. + +If the code is compiled on an ISA 3.1 system with MMA enabled, the +vector pair functions will use the @code{__vector_pair} type to have +values in adjacent vectors and do the operation as a pair of +operations. 
+ +If the code is compiled on a VSX system, but not one with MMA enabled, the vector +pair functions will use 2 separate vectors to do the operation. + +Two types are provided: @code{vector_pair_f64_t} is for vector pairs +that will operate on units of 4 64-bit floating point values, and +@code{vector_pair_f32_t} for operating on units of 8 32-bit floating +point values. + +The following functions are provided for operating on vector pairs +that consist of 4 64-bit floating point values: + +@smallexample +void vpair_f64_splat (vector_pair_f64_t *, double); + +void vpair_f64_abs (vector_pair_f64_t *, vector_pair_f64_t *); +void vpair_f64_nabs (vector_pair_f64_t *, vector_pair_f64_t *); +void vpair_f64_neg (vector_pair_f64_t *, vector_pair_f64_t *); +void vpair_f64_sqrt (vector_pair_f64_t *, vector_pair_f64_t *); + +void vpair_f64_add (vector_pair_f64_t *, vector_pair_f64_t *, + vector_pair_f64_t *); +void vpair_f64_div (vector_pair_f64_t *, vector_pair_f64_t *, + vector_pair_f64_t *); +void vpair_f64_max (vector_pair_f64_t *, vector_pair_f64_t *, + vector_pair_f64_t *); +void vpair_f64_min (vector_pair_f64_t *, vector_pair_f64_t *, + vector_pair_f64_t *); +void vpair_f64_mul (vector_pair_f64_t *, vector_pair_f64_t *, + vector_pair_f64_t *); +void vpair_f64_sub (vector_pair_f64_t *, vector_pair_f64_t *, + vector_pair_f64_t *); + +void vpair_f64_fma (vector_pair_f64_t *, vector_pair_f64_t *, + vector_pair_f64_t *, vector_pair_f64_t *); +void vpair_f64_fms (vector_pair_f64_t *, vector_pair_f64_t *, + vector_pair_f64_t *, vector_pair_f64_t *); +void vpair_f64_nfma (vector_pair_f64_t *, vector_pair_f64_t *, + vector_pair_f64_t *, vector_pair_f64_t *); +void vpair_f64_nfms (vector_pair_f64_t *, vector_pair_f64_t *, + vector_pair_f64_t *, vector_pair_f64_t *); +@end smallexample + +The following functions are provided for operating on vector pairs +that consist of 8 32-bit floating point values: + +@smallexample +void vpair_f32_splat (vector_pair_f32_t *, float); + +void 
vpair_f32_abs (vector_pair_f32_t *, vector_pair_f32_t *); +void vpair_f32_nabs (vector_pair_f32_t *, vector_pair_f32_t *); +void vpair_f32_neg (vector_pair_f32_t *, vector_pair_f32_t *); +void vpair_f32_sqrt (vector_pair_f32_t *, vector_pair_f32_t *); + +void vpair_f32_add (vector_pair_f32_t *, vector_pair_f32_t *, + vector_pair_f32_t *); +void vpair_f32_div (vector_pair_f32_t *, vector_pair_f32_t *, + vector_pair_f32_t *); +void vpair_f32_max (vector_pair_f32_t *, vector_pair_f32_t *, + vector_pair_f32_t *); +void vpair_f32_min (vector_pair_f32_t *, vector_pair_f32_t *, + vector_pair_f32_t *); +void vpair_f32_mul (vector_pair_f32_t *, vector_pair_f32_t *, + vector_pair_f32_t *); +void vpair_f32_sub (vector_pair_f32_t *, vector_pair_f32_t *, + vector_pair_f32_t *); + +void vpair_f32_fma (vector_pair_f32_t *, vector_pair_f32_t *, + vector_pair_f32_t *, vector_pair_f32_t *); +void vpair_f32_fms (vector_pair_f32_t *, vector_pair_f32_t *, + vector_pair_f32_t *, vector_pair_f32_t *); +void vpair_f32_nfma (vector_pair_f32_t *, vector_pair_f32_t *, + vector_pair_f32_t *, vector_pair_f32_t *); +void vpair_f32_nfms (vector_pair_f32_t *, vector_pair_f32_t *, + vector_pair_f32_t *, vector_pair_f32_t *); +@end smallexample + +The following functions are provided for operating on vector pairs +that swap the even and odd elements. 
+ +@smallexample +void vpair_f32_swap_odd_even (vector_pair_f32_t *, vector_pair_f32_t *); +void vpair_f64_swap_odd_even (vector_pair_f64_t *, vector_pair_f64_t *); +@end smallexample + @node PRU Built-in Functions @subsection PRU Built-in Functions diff --git a/gcc/testsuite/gcc.target/powerpc/vpair-1.c b/gcc/testsuite/gcc.target/powerpc/vpair-1.c new file mode 100644 index 00000000000..f56e99a1d04 --- /dev/null +++ b/gcc/testsuite/gcc.target/powerpc/vpair-1.c @@ -0,0 +1,150 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target power10_ok } */ +/* { dg-options "-mdejagnu-cpu=power10 -O2" } */ + +/* Test whether the vector builtin code generates the expected instructions for + vector pairs with 4 double elements. */ + +#include <vector-pair.h> + +void +test_add (vector_pair_f64_t *dest, + vector_pair_f64_t *x, + vector_pair_f64_t *y) +{ + /* 2 lxvp, 2 xvadddp, 1 stxvp. */ + vpair_f64_add (dest, x, y); +} + +void +test_sub (vector_pair_f64_t *dest, + vector_pair_f64_t *x, + vector_pair_f64_t *y) +{ + /* 2 lxvp, 2 xvsubdp, 1 stxvp. */ + vpair_f64_sub (dest, x, y); +} + +void +test_multiply (vector_pair_f64_t *dest, + vector_pair_f64_t *x, + vector_pair_f64_t *y) +{ + /* 2 lxvp, 2 xvmuldp, 1 stxvp. */ + vpair_f64_mul (dest, x, y); +} + +void +test_min (vector_pair_f64_t *dest, + vector_pair_f64_t *x, + vector_pair_f64_t *y) +{ + /* 2 lxvp, 2 xvmindp, 1 stxvp. */ + vpair_f64_min (dest, x, y); +} + +void +test_max (vector_pair_f64_t *dest, + vector_pair_f64_t *x, + vector_pair_f64_t *y) +{ + /* 2 lxvp, 2 xvmaxdp, 1 stxvp. */ + vpair_f64_max (dest, x, y); +} + +void +test_negate (vector_pair_f64_t *dest, + vector_pair_f64_t *x) +{ + /* 1 lxvp, 2 xvnegdp, 1 stxvp. */ + vpair_f64_neg (dest, x); +} + +void +test_abs (vector_pair_f64_t *dest, + vector_pair_f64_t *x) +{ + /* 1 lxvp, 2 xvabsdp, 1 stxvp. */ + vpair_f64_abs (dest, x); +} + +void +test_negative_abs (vector_pair_f64_t *dest, + vector_pair_f64_t *x) +{ + /* 1 lxvp, 2 xvnabsdp, 1 stxvp. 
*/ + vpair_f64_nabs (dest, x); +} + +void +test_sqrt (vector_pair_f64_t *dest, + vector_pair_f64_t *x) +{ + /* 1 lxvp, 2 xvsqrtdp, 1 stxvp. */ + vpair_f64_sqrt (dest, x); +} + +void +test_fma (vector_pair_f64_t *dest, + vector_pair_f64_t *x, + vector_pair_f64_t *y, + vector_pair_f64_t *z) +{ + /* 3 lxvp, 2 xvmadd{a,m}dp, 1 stxvp. */ + vpair_f64_fma (dest, x, y, z); +} + +void +test_fms (vector_pair_f64_t *dest, + vector_pair_f64_t *x, + vector_pair_f64_t *y, + vector_pair_f64_t *z) +{ + /* 3 lxvp, 2 xvmsub{a,m}dp, 1 stxvp. */ + vpair_f64_fms (dest, x, y, z); +} + +void +test_nfma (vector_pair_f64_t *dest, + vector_pair_f64_t *x, + vector_pair_f64_t *y, + vector_pair_f64_t *z) +{ + /* 3 lxvp, 2 xvnmadd{a,m}dp, 1 stxvp. */ + vpair_f64_nfma (dest, x, y, z); +} + +void +test_nfms (vector_pair_f64_t *dest, + vector_pair_f64_t *x, + vector_pair_f64_t *y, + vector_pair_f64_t *z) +{ + /* 3 lxvp, 2 xvnmsub{a,m}dp, 1 stxvp. */ + vpair_f64_nfms (dest, x, y, z); +} + +void +test_swap (vector_pair_f64_t *dest, + vector_pair_f64_t *x) +{ + /* 1 lxvp, 2 xxpermdi, 1 stxvp. 
*/ + vpair_f64_swap_odd_even (dest, x); +} + +/* { dg-final { scan-assembler-times {\mlxvp\M} 27 } } */ +/* { dg-final { scan-assembler-times {\mstxvp\M} 14 } } */ +/* { dg-final { scan-assembler-times {\mxvabsdp\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mxvadddp\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mxvmadd.dp\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mxvmaxdp\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mxvmindp\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mxvmsub.dp\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mxvmuldp\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mxvnabsdp\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mxvnegdp\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mxvnmadd.dp\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mxvnmsub.dp\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mxvsqrtdp\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mxvsubdp\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mxxpermdi\M} 2 } } */ diff --git a/gcc/testsuite/gcc.target/powerpc/vpair-2.c b/gcc/testsuite/gcc.target/powerpc/vpair-2.c new file mode 100644 index 00000000000..0f84a74e94a --- /dev/null +++ b/gcc/testsuite/gcc.target/powerpc/vpair-2.c @@ -0,0 +1,150 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target power10_ok } */ +/* { dg-options "-mdejagnu-cpu=power10 -O2" } */ + +/* Test whether the vector builtin code generates the expected instructions for + vector pairs with 8 float elements. */ + +#include <vector-pair.h> + +void +test_add (vector_pair_f32_t *dest, + vector_pair_f32_t *x, + vector_pair_f32_t *y) +{ + /* 2 lxvp, 2 xvaddsp, 1 stxvp. */ + vpair_f32_add (dest, x, y); +} + +void +test_sub (vector_pair_f32_t *dest, + vector_pair_f32_t *x, + vector_pair_f32_t *y) +{ + /* 2 lxvp, 2 xvsubsp, 1 stxvp. 
*/ + vpair_f32_sub (dest, x, y); +} + +void +test_multiply (vector_pair_f32_t *dest, + vector_pair_f32_t *x, + vector_pair_f32_t *y) +{ + /* 2 lxvp, 2 xvmulsp, 1 stxvp. */ + vpair_f32_mul (dest, x, y); +} + +void +test_min (vector_pair_f32_t *dest, + vector_pair_f32_t *x, + vector_pair_f32_t *y) +{ + /* 2 lxvp, 2 xvminsp, 1 stxvp. */ + vpair_f32_min (dest, x, y); +} + +void +test_max (vector_pair_f32_t *dest, + vector_pair_f32_t *x, + vector_pair_f32_t *y) +{ + /* 2 lxvp, 2 xvmaxsp, 1 stxvp. */ + vpair_f32_max (dest, x, y); +} + +void +test_negate (vector_pair_f32_t *dest, + vector_pair_f32_t *x) +{ + /* 1 lxvp, 2 xvnegsp, 1 stxvp. */ + vpair_f32_neg (dest, x); +} + +void +test_abs (vector_pair_f32_t *dest, + vector_pair_f32_t *x) +{ + /* 1 lxvp, 2 xvabssp, 1 stxvp. */ + vpair_f32_abs (dest, x); +} + +void +test_negative_abs (vector_pair_f32_t *dest, + vector_pair_f32_t *x) +{ + /* 1 lxvp, 2 xvnabssp, 1 stxvp. */ + vpair_f32_nabs (dest, x); +} + +void +test_sqrt (vector_pair_f32_t *dest, + vector_pair_f32_t *x) +{ + /* 1 lxvp, 2 xvsqrtsp, 1 stxvp. */ + vpair_f32_sqrt (dest, x); +} + +void +test_fma (vector_pair_f32_t *dest, + vector_pair_f32_t *x, + vector_pair_f32_t *y, + vector_pair_f32_t *z) +{ + /* 3 lxvp, 2 xvmadd{a,m}sp, 1 stxvp. */ + vpair_f32_fma (dest, x, y, z); +} + +void +test_fms (vector_pair_f32_t *dest, + vector_pair_f32_t *x, + vector_pair_f32_t *y, + vector_pair_f32_t *z) +{ + /* 3 lxvp, 2 xvmsub{a,m}sp, 1 stxvp. */ + vpair_f32_fms (dest, x, y, z); +} + +void +test_nfma (vector_pair_f32_t *dest, + vector_pair_f32_t *x, + vector_pair_f32_t *y, + vector_pair_f32_t *z) +{ + /* 3 lxvp, 2 xvnmadd{a,m}sp, 1 stxvp. */ + vpair_f32_nfma (dest, x, y, z); +} + +void +test_nfms (vector_pair_f32_t *dest, + vector_pair_f32_t *x, + vector_pair_f32_t *y, + vector_pair_f32_t *z) +{ + /* 3 lxvp, 2 xvnmsub{a,m}sp, 1 stxvp. */ + vpair_f32_nfms (dest, x, y, z); +} + +void +test_swap (vector_pair_f32_t *dest, + vector_pair_f32_t *x) +{ + /* 1 lxvp, 2 vrld, 1 stxvp. 
*/ + vpair_f32_swap_odd_even (dest, x); +} + +/* { dg-final { scan-assembler-times {\mlxvp\M} 27 } } */ +/* { dg-final { scan-assembler-times {\mstxvp\M} 14 } } */ +/* { dg-final { scan-assembler-times {\mvrld\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mxvabssp\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mxvaddsp\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mxvmadd.sp\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mxvmaxsp\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mxvminsp\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mxvmsub.sp\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mxvmulsp\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mxvnabssp\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mxvnegsp\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mxvnmadd.sp\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mxvnmsub.sp\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mxvsqrtsp\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mxvsubsp\M} 2 } } */ diff --git a/gcc/testsuite/gcc.target/powerpc/vpair-3-not-p10.c b/gcc/testsuite/gcc.target/powerpc/vpair-3-not-p10.c new file mode 100644 index 00000000000..d1a1029417f --- /dev/null +++ b/gcc/testsuite/gcc.target/powerpc/vpair-3-not-p10.c @@ -0,0 +1,15 @@ +/* { dg-do run { target { vsx_hw } } } */ +/* { dg-options "-mvsx -O2 -ffast-math -mno-mma" } */ + +/* + * This test of the double (f64) vector pair functions in vector-pair.h is run + * on VSX systems when the load/store vector pair instructions are not + * available. + * + * The -ffast-math option is used to just use the hardware sqrt, min, and max + * instructions without calling into the library. + * + * The -mno-mma option disables GCC from enabling the __vector_pair type. 
+ */ + +#include "vpair-3.h" diff --git a/gcc/testsuite/gcc.target/powerpc/vpair-3-p10.c b/gcc/testsuite/gcc.target/powerpc/vpair-3-p10.c new file mode 100644 index 00000000000..d78faf3fed4 --- /dev/null +++ b/gcc/testsuite/gcc.target/powerpc/vpair-3-p10.c @@ -0,0 +1,14 @@ +/* { dg-do run { target { power10_hw } } } */ +/* { dg-options "-mdejagnu-cpu=power10 -O2 -ffast-math -mmma" } */ + +/* + * This test of the double (f64) vector pair functions in vector-pair.h is run + * on VSX systems when the load/store vector pair instructions are available. + * + * The -ffast-math option is used to just use the hardware sqrt, min, and max + * instructions without calling into the library. + * + * The -mmma option makes sure GCC enables the __vector_pair type. + */ + +#include "vpair-3.h" diff --git a/gcc/testsuite/gcc.target/powerpc/vpair-3.h b/gcc/testsuite/gcc.target/powerpc/vpair-3.h new file mode 100644 index 00000000000..656488dbb62 --- /dev/null +++ b/gcc/testsuite/gcc.target/powerpc/vpair-3.h @@ -0,0 +1,461 @@ +/* Common include file to test the vector pair double functions. This is run + two times, once compiled for a non-power10 system that does not have the + vector pair load and store instructions, and once with power10 defaults that + has load/store vector pair. */ + +#include <stddef.h> +#include <stdlib.h> +#include <vector-pair.h> + +#ifdef DEBUG +#include <stdio.h> +#endif + +#ifndef NUM +#define NUM 16 +#endif + +static double result1[NUM]; +static double result2[NUM]; +static double in_a[NUM]; +static double in_b[NUM]; +static double in_c[NUM]; + +/* vector pair tests. 
*/ + +void +vpair_abs (double *r, double *a, double *b, double *c, size_t num) +{ + vector_pair_f64_t *vr = (vector_pair_f64_t *)r; + vector_pair_f64_t *va = (vector_pair_f64_t *)a; + + size_t i; + size_t num2 = num / (sizeof (vector_pair_f64_t) / sizeof (double)); + + for (i = 0; i < num2; i++) + vpair_f64_abs (vr + i, va + i); +} + +void +vpair_nabs (double *r, double *a, double *b, double *c, size_t num) +{ + vector_pair_f64_t *vr = (vector_pair_f64_t *)r; + vector_pair_f64_t *va = (vector_pair_f64_t *)a; + + size_t i; + size_t num2 = num / (sizeof (vector_pair_f64_t) / sizeof (double)); + + for (i = 0; i < num2; i++) + vpair_f64_nabs (vr + i, va + i); +} + +void +vpair_neg (double *r, double *a, double *b, double *c, size_t num) +{ + vector_pair_f64_t *vr = (vector_pair_f64_t *)r; + vector_pair_f64_t *va = (vector_pair_f64_t *)a; + + size_t i; + size_t num2 = num / (sizeof (vector_pair_f64_t) / sizeof (double)); + + for (i = 0; i < num2; i++) + vpair_f64_neg (vr + i, va + i); +} + +void +vpair_sqrt (double *r, double *a, double *b, double *c, size_t num) +{ + vector_pair_f64_t *vr = (vector_pair_f64_t *)r; + vector_pair_f64_t *va = (vector_pair_f64_t *)a; + + size_t i; + size_t num2 = num / (sizeof (vector_pair_f64_t) / sizeof (double)); + + for (i = 0; i < num2; i++) + vpair_f64_sqrt (vr + i, va + i); +} + +void +vpair_add (double *r, double *a, double *b, double *c, size_t num) +{ + vector_pair_f64_t *vr = (vector_pair_f64_t *)r; + vector_pair_f64_t *va = (vector_pair_f64_t *)a; + vector_pair_f64_t *vb = (vector_pair_f64_t *)b; + + size_t i; + size_t num2 = num / (sizeof (vector_pair_f64_t) / sizeof (double)); + + for (i = 0; i < num2; i++) + vpair_f64_add (vr + i, va + i, vb + i); +} + +void +vpair_sub (double *r, double *a, double *b, double *c, size_t num) +{ + vector_pair_f64_t *vr = (vector_pair_f64_t *)r; + vector_pair_f64_t *va = (vector_pair_f64_t *)a; + vector_pair_f64_t *vb = (vector_pair_f64_t *)b; + + size_t i; + size_t num2 = num / (sizeof 
(vector_pair_f64_t) / sizeof (double)); + + for (i = 0; i < num2; i++) + vpair_f64_sub (vr + i, va + i, vb + i); +} + +void +vpair_mul (double *r, double *a, double *b, double *c, size_t num) +{ + vector_pair_f64_t *vr = (vector_pair_f64_t *)r; + vector_pair_f64_t *va = (vector_pair_f64_t *)a; + vector_pair_f64_t *vb = (vector_pair_f64_t *)b; + + size_t i; + size_t num2 = num / (sizeof (vector_pair_f64_t) / sizeof (double)); + + for (i = 0; i < num2; i++) + vpair_f64_mul (vr + i, va + i, vb + i); +} + +void +vpair_div (double *r, double *a, double *b, double *c, size_t num) +{ + vector_pair_f64_t *vr = (vector_pair_f64_t *)r; + vector_pair_f64_t *va = (vector_pair_f64_t *)a; + vector_pair_f64_t *vb = (vector_pair_f64_t *)b; + + size_t i; + size_t num2 = num / (sizeof (vector_pair_f64_t) / sizeof (double)); + + for (i = 0; i < num2; i++) + vpair_f64_div (vr + i, va + i, vb + i); +} + +void +vpair_min (double *r, double *a, double *b, double *c, size_t num) +{ + vector_pair_f64_t *vr = (vector_pair_f64_t *)r; + vector_pair_f64_t *va = (vector_pair_f64_t *)a; + vector_pair_f64_t *vb = (vector_pair_f64_t *)b; + + size_t i; + size_t num2 = num / (sizeof (vector_pair_f64_t) / sizeof (double)); + + for (i = 0; i < num2; i++) + vpair_f64_min (vr + i, va + i, vb + i); +} + +void +vpair_max (double *r, double *a, double *b, double *c, size_t num) +{ + vector_pair_f64_t *vr = (vector_pair_f64_t *)r; + vector_pair_f64_t *va = (vector_pair_f64_t *)a; + vector_pair_f64_t *vb = (vector_pair_f64_t *)b; + + size_t i; + size_t num2 = num / (sizeof (vector_pair_f64_t) / sizeof (double)); + + for (i = 0; i < num2; i++) + vpair_f64_max (vr + i, va + i, vb + i); +} + +void +vpair_fma (double *r, double *a, double *b, double *c, size_t num) +{ + vector_pair_f64_t *vr = (vector_pair_f64_t *)r; + vector_pair_f64_t *va = (vector_pair_f64_t *)a; + vector_pair_f64_t *vb = (vector_pair_f64_t *)b; + vector_pair_f64_t *vc = (vector_pair_f64_t *)c; + + size_t i; + size_t num2 = num / (sizeof 
(vector_pair_f64_t) / sizeof (double)); + + for (i = 0; i < num2; i++) + vpair_f64_fma (vr + i, va + i, vb + i, vc + i); +} + +void +vpair_fms (double *r, double *a, double *b, double *c, size_t num) +{ + vector_pair_f64_t *vr = (vector_pair_f64_t *)r; + vector_pair_f64_t *va = (vector_pair_f64_t *)a; + vector_pair_f64_t *vb = (vector_pair_f64_t *)b; + vector_pair_f64_t *vc = (vector_pair_f64_t *)c; + + size_t i; + size_t num2 = num / (sizeof (vector_pair_f64_t) / sizeof (double)); + + for (i = 0; i < num2; i++) + vpair_f64_fms (vr + i, va + i, vb + i, vc + i); +} + +void +vpair_nfma (double *r, double *a, double *b, double *c, size_t num) +{ + vector_pair_f64_t *vr = (vector_pair_f64_t *)r; + vector_pair_f64_t *va = (vector_pair_f64_t *)a; + vector_pair_f64_t *vb = (vector_pair_f64_t *)b; + vector_pair_f64_t *vc = (vector_pair_f64_t *)c; + + size_t i; + size_t num2 = num / (sizeof (vector_pair_f64_t) / sizeof (double)); + + for (i = 0; i < num2; i++) + vpair_f64_nfma (vr + i, va + i, vb + i, vc + i); +} + +void +vpair_nfms (double *r, double *a, double *b, double *c, size_t num) +{ + vector_pair_f64_t *vr = (vector_pair_f64_t *)r; + vector_pair_f64_t *va = (vector_pair_f64_t *)a; + vector_pair_f64_t *vb = (vector_pair_f64_t *)b; + vector_pair_f64_t *vc = (vector_pair_f64_t *)c; + + size_t i; + size_t num2 = num / (sizeof (vector_pair_f64_t) / sizeof (double)); + + for (i = 0; i < num2; i++) + vpair_f64_nfms (vr + i, va + i, vb + i, vc + i); +} + +void +vpair_swap (double *r, double *a, double *b, double *c, size_t num) +{ + vector_pair_f64_t *vr = (vector_pair_f64_t *)r; + vector_pair_f64_t *va = (vector_pair_f64_t *)a; + + size_t i; + size_t num2 = num / (sizeof (vector_pair_f64_t) / sizeof (double)); + + for (i = 0; i < num2; i++) + vpair_f64_swap_odd_even (vr + i, va + i); +} + + +/* scalar tests. */ + +void +scalar_abs (double *r, double *a, double *b, double *c, size_t num) +{ + size_t i; + + for (i = 0; i < num; i++) + r[i] = (a[i] < 0.0) ? 
-a[i] : a[i]; +} + +void +scalar_nabs (double *r, double *a, double *b, double *c, size_t num) +{ + size_t i; + + for (i = 0; i < num; i++) + r[i] = (a[i] < 0.0) ? a[i] : -a[i]; +} + +void +scalar_neg (double *r, double *a, double *b, double *c, size_t num) +{ + size_t i; + + for (i = 0; i < num; i++) + r[i] = -a[i]; +} + +void +scalar_sqrt (double *r, double *a, double *b, double *c, size_t num) +{ + size_t i; + + for (i = 0; i < num; i++) + r[i] = __builtin_sqrt (a[i]); +} + +void +scalar_add (double *r, double *a, double *b, double *c, size_t num) +{ + size_t i; + + for (i = 0; i < num; i++) + r[i] = a[i] + b[i]; +} + +void +scalar_sub (double *r, double *a, double *b, double *c, size_t num) +{ + size_t i; + + for (i = 0; i < num; i++) + r[i] = a[i] - b[i]; +} + +void +scalar_mul (double *r, double *a, double *b, double *c, size_t num) +{ + size_t i; + + for (i = 0; i < num; i++) + r[i] = a[i] * b[i]; +} + +void +scalar_div (double *r, double *a, double *b, double *c, size_t num) +{ + size_t i; + + for (i = 0; i < num; i++) + r[i] = a[i] / b[i]; +} + +void +scalar_min (double *r, double *a, double *b, double *c, size_t num) +{ + size_t i; + + for (i = 0; i < num; i++) + r[i] = (a[i] < b[i]) ? a[i] : b[i]; +} + +void +scalar_max (double *r, double *a, double *b, double *c, size_t num) +{ + size_t i; + + for (i = 0; i < num; i++) + r[i] = (a[i] > b[i]) ? 
a[i] : b[i]; +} + +void +scalar_fma (double *r, double *a, double *b, double *c, size_t num) +{ + size_t i; + + for (i = 0; i < num; i++) + r[i] = __builtin_fma (a[i], b[i], c[i]); +} + +void +scalar_fms (double *r, double *a, double *b, double *c, size_t num) +{ + size_t i; + + for (i = 0; i < num; i++) + r[i] = __builtin_fma (a[i], b[i], -c[i]); +} + +void +scalar_nfma (double *r, double *a, double *b, double *c, size_t num) +{ + size_t i; + + for (i = 0; i < num; i++) + r[i] = - __builtin_fma (a[i], b[i], c[i]); +} + +void +scalar_nfms (double *r, double *a, double *b, double *c, size_t num) +{ + size_t i; + + for (i = 0; i < num; i++) + r[i] = - __builtin_fma (a[i], b[i], -c[i]); +} + +void +scalar_swap (double *r, double *a, double *b, double *c, size_t num) +{ + size_t i; + + for (i = 0; i < num; i += 2) + { + r[i] = a[i+1]; + r[i+1] = a[i]; + } +} + + +/* Check results. */ +void +check (const char *name) +{ + size_t i; + + for (i = 0; i < NUM; i++) + if (result1[i] != result2[i]) + { +#ifdef DEBUG + printf ("test #%ld failed, %g != %g, %s (%g, %g, %g).\n", + (long)i, + result1[i], + result2[i], + name, + in_a[i], + in_b[i], + in_c[i]); +#endif + abort (); + } + + return; +} + +typedef void func_t (double *, double *, double *, double *, size_t); + +/* tests to run. */ +struct +{ + func_t *vpair_test; + func_t *scalar_test; + const char *name; +} tests[] = { + { vpair_abs, scalar_abs, "abs" }, + { vpair_nabs, scalar_nabs, "nabs" }, + { vpair_neg, scalar_neg, "neg" }, + { vpair_sqrt, scalar_sqrt, "sqrt" }, + { vpair_add, scalar_add, "add" }, + { vpair_sub, scalar_sub, "sub" }, + { vpair_mul, scalar_mul, "mul" }, + { vpair_div, scalar_div, "div" }, + { vpair_min, scalar_min, "min" }, + { vpair_max, scalar_max, "max" }, + { vpair_fma, scalar_fma, "fma" }, + { vpair_fms, scalar_fms, "fms" }, + { vpair_nfma, scalar_nfma, "nfma" }, + { vpair_nfms, scalar_nfms, "nfms" }, + { vpair_swap, scalar_swap, "swap" }, +}; + +/* Run tests. 
*/ + +int +main (void) +{ + size_t i; + + /* Initialize the inputs. */ + for (i = 0; i < NUM; i++) + { + double d = (double)(i + 1); + in_a[i] = d * d; + in_b[i] = d; + in_c[i] = d + 2.0; + } + +#ifdef DEBUG + printf ("Start tests\n"); +#endif + + /* Run the tests. */ + for (i = 0; i < sizeof (tests) / sizeof (tests[0]); i++) + { + tests[i].vpair_test (result1, in_a, in_b, in_c, NUM); + tests[i].scalar_test (result2, in_a, in_b, in_c, NUM); + check (tests[i].name); + } + +#ifdef DEBUG + printf ("End tests\n"); +#endif + + return 0; +} diff --git a/gcc/testsuite/gcc.target/powerpc/vpair-4-not-p10.c b/gcc/testsuite/gcc.target/powerpc/vpair-4-not-p10.c new file mode 100644 index 00000000000..f57fbbf8b05 --- /dev/null +++ b/gcc/testsuite/gcc.target/powerpc/vpair-4-not-p10.c @@ -0,0 +1,15 @@ +/* { dg-do run { target { vsx_hw } } } */ +/* { dg-options "-mvsx -O2 -ffast-math -mno-mma" } */ + +/* + * This test of the float (f32) vector pair functions in vector-pair.h is run + * on VSX systems when the load/store vector pair instructions are not + * available. + * + * The -ffast-math option is used to just use the hardware sqrt, min, and max + * instructions without calling into the library. + * + * The -mno-mma option disables GCC from enabling the __vector_pair type. + */ + +#include "vpair-4.h" diff --git a/gcc/testsuite/gcc.target/powerpc/vpair-4-p10.c b/gcc/testsuite/gcc.target/powerpc/vpair-4-p10.c new file mode 100644 index 00000000000..12291202c16 --- /dev/null +++ b/gcc/testsuite/gcc.target/powerpc/vpair-4-p10.c @@ -0,0 +1,14 @@ +/* { dg-do run { target { power10_hw } } } */ +/* { dg-options "-mdejagnu-cpu=power10 -O2 -ffast-math -mmma" } */ + +/* + * This test of the float (f32) vector pair functions in vector-pair.h is run + * on VSX systems when the load/store vector pair instructions are available. + * + * The -ffast-math option is used to just use the hardware sqrt, min, and max + * instructions without calling into the library. 
+ * + * The -mmma option makes sure GCC enables the __vector_pair type. + */ + +#include "vpair-4.h" diff --git a/gcc/testsuite/gcc.target/powerpc/vpair-4.h b/gcc/testsuite/gcc.target/powerpc/vpair-4.h new file mode 100644 index 00000000000..49384e27974 --- /dev/null +++ b/gcc/testsuite/gcc.target/powerpc/vpair-4.h @@ -0,0 +1,461 @@ +/* Common include file to test the vector pair float functions. This is run + two times, once compiled for a non-power10 system that does not have the + vector pair load and store instructions, and once with power10 defaults that + has load/store vector pair. */ + +#include <stddef.h> +#include <stdlib.h> +#include <vector-pair.h> + +#ifdef DEBUG +#include <stdio.h> +#endif + +#ifndef NUM +#define NUM 16 +#endif + +static float result1[NUM]; +static float result2[NUM]; +static float in_a[NUM]; +static float in_b[NUM]; +static float in_c[NUM]; + +/* vector pair tests. */ + +void +vpair_abs (float *r, float *a, float *b, float *c, size_t num) +{ + vector_pair_f32_t *vr = (vector_pair_f32_t *)r; + vector_pair_f32_t *va = (vector_pair_f32_t *)a; + + size_t i; + size_t num2 = num / (sizeof (vector_pair_f32_t) / sizeof (float)); + + for (i = 0; i < num2; i++) + vpair_f32_abs (vr + i, va + i); +} + +void +vpair_nabs (float *r, float *a, float *b, float *c, size_t num) +{ + vector_pair_f32_t *vr = (vector_pair_f32_t *)r; + vector_pair_f32_t *va = (vector_pair_f32_t *)a; + + size_t i; + size_t num2 = num / (sizeof (vector_pair_f32_t) / sizeof (float)); + + for (i = 0; i < num2; i++) + vpair_f32_nabs (vr + i, va + i); +} + +void +vpair_neg (float *r, float *a, float *b, float *c, size_t num) +{ + vector_pair_f32_t *vr = (vector_pair_f32_t *)r; + vector_pair_f32_t *va = (vector_pair_f32_t *)a; + + size_t i; + size_t num2 = num / (sizeof (vector_pair_f32_t) / sizeof (float)); + + for (i = 0; i < num2; i++) + vpair_f32_neg (vr + i, va + i); +} + +void +vpair_sqrt (float *r, float *a, float *b, float *c, size_t num) +{ + vector_pair_f32_t *vr = 
(vector_pair_f32_t *)r; + vector_pair_f32_t *va = (vector_pair_f32_t *)a; + + size_t i; + size_t num2 = num / (sizeof (vector_pair_f32_t) / sizeof (float)); + + for (i = 0; i < num2; i++) + vpair_f32_sqrt (vr + i, va + i); +} + +void +vpair_add (float *r, float *a, float *b, float *c, size_t num) +{ + vector_pair_f32_t *vr = (vector_pair_f32_t *)r; + vector_pair_f32_t *va = (vector_pair_f32_t *)a; + vector_pair_f32_t *vb = (vector_pair_f32_t *)b; + + size_t i; + size_t num2 = num / (sizeof (vector_pair_f32_t) / sizeof (float)); + + for (i = 0; i < num2; i++) + vpair_f32_add (vr + i, va + i, vb + i); +} + +void +vpair_sub (float *r, float *a, float *b, float *c, size_t num) +{ + vector_pair_f32_t *vr = (vector_pair_f32_t *)r; + vector_pair_f32_t *va = (vector_pair_f32_t *)a; + vector_pair_f32_t *vb = (vector_pair_f32_t *)b; + + size_t i; + size_t num2 = num / (sizeof (vector_pair_f32_t) / sizeof (float)); + + for (i = 0; i < num2; i++) + vpair_f32_sub (vr + i, va + i, vb + i); +} + +void +vpair_mul (float *r, float *a, float *b, float *c, size_t num) +{ + vector_pair_f32_t *vr = (vector_pair_f32_t *)r; + vector_pair_f32_t *va = (vector_pair_f32_t *)a; + vector_pair_f32_t *vb = (vector_pair_f32_t *)b; + + size_t i; + size_t num2 = num / (sizeof (vector_pair_f32_t) / sizeof (float)); + + for (i = 0; i < num2; i++) + vpair_f32_mul (vr + i, va + i, vb + i); +} + +void +vpair_div (float *r, float *a, float *b, float *c, size_t num) +{ + vector_pair_f32_t *vr = (vector_pair_f32_t *)r; + vector_pair_f32_t *va = (vector_pair_f32_t *)a; + vector_pair_f32_t *vb = (vector_pair_f32_t *)b; + + size_t i; + size_t num2 = num / (sizeof (vector_pair_f32_t) / sizeof (float)); + + for (i = 0; i < num2; i++) + vpair_f32_div (vr + i, va + i, vb + i); +} + +void +vpair_min (float *r, float *a, float *b, float *c, size_t num) +{ + vector_pair_f32_t *vr = (vector_pair_f32_t *)r; + vector_pair_f32_t *va = (vector_pair_f32_t *)a; + vector_pair_f32_t *vb = (vector_pair_f32_t *)b; + + size_t 
i; + size_t num2 = num / (sizeof (vector_pair_f32_t) / sizeof (float)); + + for (i = 0; i < num2; i++) + vpair_f32_min (vr + i, va + i, vb + i); +} + +void +vpair_max (float *r, float *a, float *b, float *c, size_t num) +{ + vector_pair_f32_t *vr = (vector_pair_f32_t *)r; + vector_pair_f32_t *va = (vector_pair_f32_t *)a; + vector_pair_f32_t *vb = (vector_pair_f32_t *)b; + + size_t i; + size_t num2 = num / (sizeof (vector_pair_f32_t) / sizeof (float)); + + for (i = 0; i < num2; i++) + vpair_f32_max (vr + i, va + i, vb + i); +} + +void +vpair_fma (float *r, float *a, float *b, float *c, size_t num) +{ + vector_pair_f32_t *vr = (vector_pair_f32_t *)r; + vector_pair_f32_t *va = (vector_pair_f32_t *)a; + vector_pair_f32_t *vb = (vector_pair_f32_t *)b; + vector_pair_f32_t *vc = (vector_pair_f32_t *)c; + + size_t i; + size_t num2 = num / (sizeof (vector_pair_f32_t) / sizeof (float)); + + for (i = 0; i < num2; i++) + vpair_f32_fma (vr + i, va + i, vb + i, vc + i); +} + +void +vpair_fms (float *r, float *a, float *b, float *c, size_t num) +{ + vector_pair_f32_t *vr = (vector_pair_f32_t *)r; + vector_pair_f32_t *va = (vector_pair_f32_t *)a; + vector_pair_f32_t *vb = (vector_pair_f32_t *)b; + vector_pair_f32_t *vc = (vector_pair_f32_t *)c; + + size_t i; + size_t num2 = num / (sizeof (vector_pair_f32_t) / sizeof (float)); + + for (i = 0; i < num2; i++) + vpair_f32_fms (vr + i, va + i, vb + i, vc + i); +} + +void +vpair_nfma (float *r, float *a, float *b, float *c, size_t num) +{ + vector_pair_f32_t *vr = (vector_pair_f32_t *)r; + vector_pair_f32_t *va = (vector_pair_f32_t *)a; + vector_pair_f32_t *vb = (vector_pair_f32_t *)b; + vector_pair_f32_t *vc = (vector_pair_f32_t *)c; + + size_t i; + size_t num2 = num / (sizeof (vector_pair_f32_t) / sizeof (float)); + + for (i = 0; i < num2; i++) + vpair_f32_nfma (vr + i, va + i, vb + i, vc + i); +} + +void +vpair_nfms (float *r, float *a, float *b, float *c, size_t num) +{ + vector_pair_f32_t *vr = (vector_pair_f32_t *)r; + 
vector_pair_f32_t *va = (vector_pair_f32_t *)a; + vector_pair_f32_t *vb = (vector_pair_f32_t *)b; + vector_pair_f32_t *vc = (vector_pair_f32_t *)c; + + size_t i; + size_t num2 = num / (sizeof (vector_pair_f32_t) / sizeof (float)); + + for (i = 0; i < num2; i++) + vpair_f32_nfms (vr + i, va + i, vb + i, vc + i); +} + +void +vpair_swap (float *r, float *a, float *b, float *c, size_t num) +{ + vector_pair_f32_t *vr = (vector_pair_f32_t *)r; + vector_pair_f32_t *va = (vector_pair_f32_t *)a; + + size_t i; + size_t num2 = num / (sizeof (vector_pair_f32_t) / sizeof (float)); + + for (i = 0; i < num2; i++) + vpair_f32_swap_odd_even (vr + i, va + i); +} + + +/* scalar tests. */ + +void +scalar_abs (float *r, float *a, float *b, float *c, size_t num) +{ + size_t i; + + for (i = 0; i < num; i++) + r[i] = (a[i] < 0.0) ? -a[i] : a[i]; +} + +void +scalar_nabs (float *r, float *a, float *b, float *c, size_t num) +{ + size_t i; + + for (i = 0; i < num; i++) + r[i] = (a[i] < 0.0) ? a[i] : -a[i]; +} + +void +scalar_neg (float *r, float *a, float *b, float *c, size_t num) +{ + size_t i; + + for (i = 0; i < num; i++) + r[i] = -a[i]; +} + +void +scalar_sqrt (float *r, float *a, float *b, float *c, size_t num) +{ + size_t i; + + for (i = 0; i < num; i++) + r[i] = __builtin_sqrt (a[i]); +} + +void +scalar_add (float *r, float *a, float *b, float *c, size_t num) +{ + size_t i; + + for (i = 0; i < num; i++) + r[i] = a[i] + b[i]; +} + +void +scalar_sub (float *r, float *a, float *b, float *c, size_t num) +{ + size_t i; + + for (i = 0; i < num; i++) + r[i] = a[i] - b[i]; +} + +void +scalar_mul (float *r, float *a, float *b, float *c, size_t num) +{ + size_t i; + + for (i = 0; i < num; i++) + r[i] = a[i] * b[i]; +} + +void +scalar_div (float *r, float *a, float *b, float *c, size_t num) +{ + size_t i; + + for (i = 0; i < num; i++) + r[i] = a[i] / b[i]; +} + +void +scalar_min (float *r, float *a, float *b, float *c, size_t num) +{ + size_t i; + + for (i = 0; i < num; i++) + r[i] = (a[i] < 
b[i]) ? a[i] : b[i]; +} + +void +scalar_max (float *r, float *a, float *b, float *c, size_t num) +{ + size_t i; + + for (i = 0; i < num; i++) + r[i] = (a[i] > b[i]) ? a[i] : b[i]; +} + +void +scalar_fma (float *r, float *a, float *b, float *c, size_t num) +{ + size_t i; + + for (i = 0; i < num; i++) + r[i] = __builtin_fmaf (a[i], b[i], c[i]); /* Float FMA: rounds once, to float. */ +} + +void +scalar_fms (float *r, float *a, float *b, float *c, size_t num) +{ + size_t i; + + for (i = 0; i < num; i++) + r[i] = __builtin_fmaf (a[i], b[i], -c[i]); +} + +void +scalar_nfma (float *r, float *a, float *b, float *c, size_t num) +{ + size_t i; + + for (i = 0; i < num; i++) + r[i] = - __builtin_fmaf (a[i], b[i], c[i]); +} + +void +scalar_nfms (float *r, float *a, float *b, float *c, size_t num) +{ + size_t i; + + for (i = 0; i < num; i++) + r[i] = - __builtin_fmaf (a[i], b[i], -c[i]); +} + +void +scalar_swap (float *r, float *a, float *b, float *c, size_t num) +{ + size_t i; + + for (i = 0; i < num; i += 2) + { + r[i] = a[i+1]; + r[i+1] = a[i]; + } +} + + +/* Check results; abort at the first index where result1 and result2 differ. */ +void +check (const char *name) +{ + size_t i; + + for (i = 0; i < NUM; i++) + if (result1[i] != result2[i]) + { +#ifdef DEBUG + printf ("test #%ld failed, %g != %g, %s (%g, %g, %g).\n", + (long)i, + result1[i], + result2[i], + name, + in_a[i], + in_b[i], + in_c[i]); +#endif + abort (); + } + + return; +} + +typedef void func_t (float *, float *, float *, float *, size_t); + +/* tests to run. 
*/ +struct +{ + func_t *vpair_test; + func_t *scalar_test; + const char *name; +} tests[] = { + { vpair_abs, scalar_abs, "abs" }, + { vpair_nabs, scalar_nabs, "nabs" }, + { vpair_neg, scalar_neg, "neg" }, + { vpair_sqrt, scalar_sqrt, "sqrt" }, + { vpair_add, scalar_add, "add" }, + { vpair_sub, scalar_sub, "sub" }, + { vpair_mul, scalar_mul, "mul" }, + { vpair_div, scalar_div, "div" }, + { vpair_min, scalar_min, "min" }, + { vpair_max, scalar_max, "max" }, + { vpair_fma, scalar_fma, "fma" }, + { vpair_fms, scalar_fms, "fms" }, + { vpair_nfma, scalar_nfma, "nfma" }, + { vpair_nfms, scalar_nfms, "nfms" }, + { vpair_swap, scalar_swap, "swap" }, +}; + +/* Run tests. */ + +int +main (void) +{ + size_t i; + + /* Initialize the inputs. */ + for (i = 0; i < NUM; i++) + { + float f = (float)(i + 1); + in_a[i] = f * f; + in_b[i] = f; + in_c[i] = f + 2.0f; + } + +#ifdef DEBUG + printf ("Start tests\n"); +#endif + + /* Run the tests. */ + for (i = 0; i < sizeof (tests) / sizeof (tests[0]); i++) + { + tests[i].vpair_test (result1, in_a, in_b, in_c, NUM); + tests[i].scalar_test (result2, in_a, in_b, in_c, NUM); + check (tests[i].name); + } + +#ifdef DEBUG + printf ("End tests\n"); +#endif + + return 0; +} -- 2.47.0 -- Michael Meissner, IBM PO Box 98, Ayer, Massachusetts, USA, 01432 email: meiss...@linux.ibm.com