See the previous post for a longer explanation of the motivations for this patch:
https://gcc.gnu.org/pipermail/gcc-patches/2024-October/664694.html This patch adds a new include file (vector-pair.h) that implements a series of functions that allows people implementing high performance libraries to optimize their code to use the vector pair load/store instructions on power10 computers to enhance the memory bandwidth. I have tested this on both big endian and little endian servers. Can I check this into the GCC trunk? 2024-10-07 Michael Meissner <meiss...@linux.ibm.com> gcc/ * config.gcc (powerpc*-*-*): Add vector-pair.h to extra headers. * config/rs6000/vector-pair.h: New file. * doc/extend.texi (PowerPC Vector Pair Support): Document the vector pair support functions. gcc/testsuite/ * gcc.target/powerpc/vpair-1.c: New test or include file. * gcc.target/powerpc/vpair-2.c: Likewise. * gcc.target/powerpc/vpair-3-not-p10.c: Likewise. * gcc.target/powerpc/vpair-3-p10.c: Likewise. * gcc.target/powerpc/vpair-3.h: Likewise. * gcc.target/powerpc/vpair-4-not-p10.c: Likewise. * gcc.target/powerpc/vpair-4-p10.c: Likewise. * gcc.target/powerpc/vpair-4.h: Likewise. 
--- gcc/config.gcc | 2 +- gcc/config/rs6000/rs6000-c.cc | 8 +- gcc/config/rs6000/vector-pair.h | 519 ++++++++++++++++++ gcc/doc/extend.texi | 98 ++++ gcc/testsuite/gcc.target/powerpc/vpair-1.c | 141 +++++ gcc/testsuite/gcc.target/powerpc/vpair-2.c | 141 +++++ .../gcc.target/powerpc/vpair-3-not-p10.c | 15 + .../gcc.target/powerpc/vpair-3-p10.c | 14 + gcc/testsuite/gcc.target/powerpc/vpair-3.h | 435 +++++++++++++++ .../gcc.target/powerpc/vpair-4-not-p10.c | 15 + .../gcc.target/powerpc/vpair-4-p10.c | 14 + gcc/testsuite/gcc.target/powerpc/vpair-4.h | 435 +++++++++++++++ 12 files changed, 1834 insertions(+), 3 deletions(-) create mode 100644 gcc/config/rs6000/vector-pair.h create mode 100644 gcc/testsuite/gcc.target/powerpc/vpair-1.c create mode 100644 gcc/testsuite/gcc.target/powerpc/vpair-2.c create mode 100644 gcc/testsuite/gcc.target/powerpc/vpair-3-not-p10.c create mode 100644 gcc/testsuite/gcc.target/powerpc/vpair-3-p10.c create mode 100644 gcc/testsuite/gcc.target/powerpc/vpair-3.h create mode 100644 gcc/testsuite/gcc.target/powerpc/vpair-4-not-p10.c create mode 100644 gcc/testsuite/gcc.target/powerpc/vpair-4-p10.c create mode 100644 gcc/testsuite/gcc.target/powerpc/vpair-4.h diff --git a/gcc/config.gcc b/gcc/config.gcc index 0b794e977f6..3627bed8b86 100644 --- a/gcc/config.gcc +++ b/gcc/config.gcc @@ -537,7 +537,7 @@ powerpc*-*-*) extra_headers="${extra_headers} pmmintrin.h tmmintrin.h smmintrin.h" extra_headers="${extra_headers} nmmintrin.h immintrin.h x86gprintrin.h" extra_headers="${extra_headers} ppu_intrinsics.h spu2vmx.h vec_types.h si2vmx.h" - extra_headers="${extra_headers} amo.h" + extra_headers="${extra_headers} amo.h vector-pair.h" case x$with_cpu in xpowerpc64|xdefault64|x6[23]0|x970|xG5|xpower[3456789]|xpower1[01]|xpower6x|xrs64a|xcell|xa2|xe500mc64|xe5500|xe6500|xfuture) cpu_is_64bit=yes diff --git a/gcc/config/rs6000/rs6000-c.cc b/gcc/config/rs6000/rs6000-c.cc index 82826f96a8e..77bee8fc878 100644 --- a/gcc/config/rs6000/rs6000-c.cc +++ 
b/gcc/config/rs6000/rs6000-c.cc @@ -590,9 +590,13 @@ rs6000_target_modify_macros (bool define_p, HOST_WIDE_INT flags, if (rs6000_cpu == PROCESSOR_CELL) rs6000_define_or_undefine_macro (define_p, "__PPU__"); - /* Tell the user if we support the MMA instructions. */ + /* Tell the user if we support the MMA instructions. Also tell vector-pair.h + that we have the vector pair built-in function support. */ if ((flags & OPTION_MASK_MMA) != 0) - rs6000_define_or_undefine_macro (define_p, "__MMA__"); + { + rs6000_define_or_undefine_macro (define_p, "__MMA__"); + rs6000_define_or_undefine_macro (define_p, "__VPAIR__"); + } /* Whether pc-relative code is being generated. */ if ((flags & OPTION_MASK_PCREL) != 0) rs6000_define_or_undefine_macro (define_p, "__PCREL__"); diff --git a/gcc/config/rs6000/vector-pair.h b/gcc/config/rs6000/vector-pair.h new file mode 100644 index 00000000000..ceb28c4e974 --- /dev/null +++ b/gcc/config/rs6000/vector-pair.h @@ -0,0 +1,519 @@ +/* PowerPC vector pair include file. + Copyright (C) 2024 Free Software Foundation, Inc. + Contributed by Aldy Hernandez (al...@redhat.com). + Rewritten by Paolo Bonzini (bonz...@gnu.org). + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3, or (at your + option) any later version. + + GCC is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public + License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. 
+ + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + <http://www.gnu.org/licenses/>. */ + +/* Provide support for vector pairs, even on systems that do not have native + support for loading and storing pairs of vectors. */ + +#ifndef _VECTOR_PAIR_H +#define _VECTOR_PAIR_H 1 + +/* Union of the various vector pair types. */ +union __vpair_union { + +#ifdef __MMA__ + __vector_pair __vpair; +#endif + + vector double __vp_f64[2]; + vector float __vp_f32[2]; + vector unsigned char __vp_uc[2]; +}; + +typedef union __vpair_union vector_pair_f64_t; +typedef union __vpair_union vector_pair_f32_t; + +#if !__VPAIR_BUILTIN__ && !__VPAIR_ASM__ && !__VPAIR_NOP10__ +#if __MMA__ +#define __VPAIR_ASM__ 1 + +#else +#define __VPAIR_NOP10__ 1 +#endif +#endif + +/* Macros to simplify creation of the various operations. + * + * The __VPAIR_FP_{UNARY,BINARY,FMA} macros are the base macros, and take: + * R: The argument for the output vector pair + * A, B, C: 1-3 arguments for the inputs + * OPCODE: The assembler opcode for __asm__ on power10 + * VEC: Either __vp_f64 or __vp_f32 for the union field + * VEC_FUNC: 128-bit vector function for use on power8/power9 + * + * The __VPAIR_FP_SPLAT macro takes: + * R: The argument for the output vector pair + * X: The scalar that is to be splat-ed to the vector pair + * VEC: Either __vp_f64 or __vp_f32 for the union field + * + * The __VPAIR_F32_<...> and __VPAIR_F64_<...> macros call the above macros + * with the appropriate structure field to use. + */ + +#undef __VPAIR_FP_SPLAT +#undef __VPAIR_FP_UNARY +#undef __VPAIR_FP_BINARY +#undef __VPAIR_FP_FMA + +#undef __VPAIR_F64_UNARY +#undef __VPAIR_F64_BINARY +#undef __VPAIR_F64_FMA + +#undef __VPAIR_F32_UNARY +#undef __VPAIR_F32_BINARY +#undef __VPAIR_F32_FMA + +/* Operations using a vector pair and __asm__ operations. 
*/ +#if __MMA__ && !__VPAIR_NOP10__ + +/* When using __asm__, we need to access the second register. Due to the way + VSX registers were formed by combining the traditional floating point + registers and Altivec registers, we can't use the output modifier %L<n> to + refer to the second register if the VSX register was a traditional Altivec + register. If the value is in VSX registers 34 & 35, %x0 would give 34, but + %L0 would give 1, since 'Altivec' registers start at 0. + + If we are using GAS under Linux, we can use %x0+1 to access the second + register and use the full VSX register set. + + If this include file is used on non-Linux systems, or with a non-GCC + compiler, limit the registers used to the traditional FPR registers so that + we can use %L0. */ + +#if __VPAIR__USE_FPR__ || !__GNUC__ || (!__linux__ && !__ELF__) + +/* Use %0 and %L0 on traditional FPR registers. */ +#define __VPAIR_FP_SPLAT(R, X, VEC) \ + __asm__ ("xxlor %L0,%0,%0" \ + : "=d" ((R)->__vpair) \ + : "0" (__builtin_vec_splats ((X)))) + +#define __VPAIR_FP_UNARY(R, A, OPCODE, VEC, VEC_FUNC) \ + __asm__ (OPCODE " %0,%1\n\t" OPCODE " %L0,%L1" \ + : "=d" ((R)->__vpair) \ + : "d" ((A)->__vpair)) + +#define __VPAIR_FP_BINARY(R, A, B, OPCODE, VEC, VEC_FUNC) \ + __asm__ (OPCODE " %0,%1,%2\n\t" OPCODE " %L0,%L1,%L2" \ + : "=d" ((R)->__vpair) \ + : "d" ((A)->__vpair), "d" ((B)->__vpair)) + +/* Note the 'a' form of the fma instructions must be used. */ +#define __VPAIR_FP_FMA(R, A, B, C, OPCODE, VEC, VEC_FUNC) \ + __asm__ (OPCODE " %0,%1,%2\n\t" OPCODE " %L0,%L1,%L2" \ + : "=d" ((R)->__vpair) \ + : "d" ((A)->__vpair), "d" ((B)->__vpair), "0" ((C)->__vpair)) + +#else + +/* Use %x0 and %x0+1 on VSX registers. 
*/ +#define __VPAIR_FP_SPLAT(R, X, VEC) \ + __asm__ ("xxlor %x0+1,%x0,%x0" \ + : "=wa" ((R)->__vpair) \ + : "0" (__builtin_vec_splats ((X)))) + +#define __VPAIR_FP_UNARY(R, A, OPCODE, VEC, VEC_FUNC) \ + __asm__ (OPCODE " %x0,%x1\n\t" OPCODE " %x0+1,%x1+1" \ + : "=wa" ((R)->__vpair) \ + : "wa" ((A)->__vpair)) + +#define __VPAIR_FP_BINARY(R, A, B, OPCODE, VEC, VEC_FUNC) \ + __asm__ (OPCODE " %x0,%x1,%x2\n\t" OPCODE " %x0+1,%x1+1,%x2+1" \ + : "=wa" ((R)->__vpair) \ + : "wa" ((A)->__vpair), "wa" ((B)->__vpair)) + +/* Note the 'a' form of the fma instructions must be used. */ +#define __VPAIR_FP_FMA(R, A, B, C, OPCODE, VEC, VEC_FUNC) \ + __asm__ (OPCODE " %x0,%x1,%x2\n\t" OPCODE " %x0+1,%x1+1,%x2+1" \ + : "=wa" ((R)->__vpair) \ + : "wa" ((A)->__vpair), "wa" ((B)->__vpair), "0" ((C)->__vpair)) +#endif /* Select whether to use %0/%L0 or %x0/%x0+1. */ + +#else /* vpair support on power8/power9. */ + +/* Pair of vector operations using a built-in function. */ + +#define __VPAIR_FP_SPLAT(R, X, VEC) \ + (R)->VEC[0] = (R)->VEC[1] = __builtin_vec_splats ((X)) + +#define __VPAIR_FP_UNARY(R, A, OPCODE, VEC, VEC_FUNC) \ + do \ + { \ + (R)->VEC[0] = VEC_FUNC ((A)->VEC[0]); \ + (R)->VEC[1] = VEC_FUNC ((A)->VEC[1]); \ + } \ + while (0) + +#define __VPAIR_FP_BINARY(R, A, B, OPCODE, VEC, VEC_FUNC) \ + do \ + { \ + (R)->VEC[0] = VEC_FUNC ((A)->VEC[0], (B)->VEC[0]); \ + (R)->VEC[1] = VEC_FUNC ((A)->VEC[1], (B)->VEC[1]); \ + } \ + while (0) + +#define __VPAIR_FP_FMA(R, A, B, C, OPCODE, VEC, VEC_FUNC) \ + do \ + { \ + (R)->VEC[0] = VEC_FUNC ((A)->VEC[0], (B)->VEC[0], (C)->VEC[0]); \ + (R)->VEC[1] = VEC_FUNC ((A)->VEC[1], (B)->VEC[1], (C)->VEC[1]); \ + } \ + while (0) + +#endif + +/* 64-bit version of the macros. 
*/ +#define __VPAIR_F64_UNARY(R, A, OPCODE, VEC_FUNC) \ + __VPAIR_FP_UNARY(R, A, OPCODE, __vp_f64, VEC_FUNC) + +#define __VPAIR_F64_BINARY(R, A, B, OPCODE, VEC_FUNC) \ + __VPAIR_FP_BINARY(R, A, B, OPCODE, __vp_f64, VEC_FUNC) + +#define __VPAIR_F64_FMA(R, A, B, C, OPCODE, VEC_FUNC) \ + __VPAIR_FP_FMA(R, A, B, C, OPCODE, __vp_f64, VEC_FUNC) + + +/* 32-bit version of the macros. */ +#define __VPAIR_F32_UNARY(R, A, OPCODE, VEC_FUNC) \ + __VPAIR_FP_UNARY(R, A, OPCODE, __vp_f32, VEC_FUNC) + +#define __VPAIR_F32_BINARY(R, A, B, OPCODE, VEC_FUNC) \ + __VPAIR_FP_BINARY(R, A, B, OPCODE, __vp_f32, VEC_FUNC) + +#define __VPAIR_F32_FMA(R, A, B, C, OPCODE, VEC_FUNC) \ + __VPAIR_FP_FMA(R, A, B, C, OPCODE, __vp_f32, VEC_FUNC) + + +/* Splat functions. */ + +/* 64-bit splat to vector pair. */ + +static inline void +vpair_f64_splat (vector_pair_f64_t *__r, double __x) +{ + __VPAIR_FP_SPLAT (__r, __x, __vp_f64); +} + +/* 32-bit splat to vector pair. */ + +static inline void +vpair_f32_splat (vector_pair_f32_t *__r, float __x) +{ + __VPAIR_FP_SPLAT (__r, __x, __vp_f32); +} + + +/* 64-bit unary functions. */ + +static inline void +vpair_f64_abs (vector_pair_f64_t *__r, + const vector_pair_f64_t *__a) +{ + __VPAIR_F64_UNARY (__r, __a, + "xvabsdp", + __builtin_vec_abs); +} + +static inline void +vpair_f64_nabs (vector_pair_f64_t *__r, + const vector_pair_f64_t *__a) +{ + __VPAIR_F64_UNARY (__r, __a, + "xvnabsdp", + __builtin_vec_nabs); +} + +static inline void +vpair_f64_neg (vector_pair_f64_t *__r, + const vector_pair_f64_t *__a) +{ + __VPAIR_F64_UNARY (__r, __a, + "xvnegdp", + __builtin_vec_neg); +} + +static inline void +vpair_f64_sqrt (vector_pair_f64_t *__r, + const vector_pair_f64_t *__a) +{ + __VPAIR_F64_UNARY (__r, __a, + "xvsqrtdp", + __builtin_vec_sqrt); +} + +/* 32-bit unary functions. 
*/ + +static inline void +vpair_f32_abs (vector_pair_f32_t *__r, + const vector_pair_f32_t *__a) +{ + __VPAIR_F32_UNARY (__r, __a, + "xvabssp", + __builtin_vec_abs); +} + +static inline void +vpair_f32_nabs (vector_pair_f32_t *__r, + const vector_pair_f32_t *__a) +{ + __VPAIR_F32_UNARY (__r, __a, + "xvnabssp", + __builtin_vec_nabs); +} + +static inline void +vpair_f32_neg (vector_pair_f32_t *__r, + const vector_pair_f32_t *__a) +{ + __VPAIR_F32_UNARY (__r, __a, + "xvnegsp", + __builtin_vec_neg); +} + +static inline void +vpair_f32_sqrt (vector_pair_f32_t *__r, + const vector_pair_f32_t *__a) +{ + __VPAIR_F32_UNARY (__r, __a, + "xvsqrtsp", + __builtin_vec_sqrt); +} + + +/* 64-bit binary functions. */ + +static inline void +vpair_f64_add (vector_pair_f64_t *__r, + const vector_pair_f64_t *__a, + const vector_pair_f64_t *__b) +{ + __VPAIR_F64_BINARY (__r, __a, __b, + "xvadddp", + __builtin_vec_add); +} + +static inline void +vpair_f64_div (vector_pair_f64_t *__r, + const vector_pair_f64_t *__a, + const vector_pair_f64_t *__b) +{ + __VPAIR_F64_BINARY (__r, __a, __b, + "xvdivdp", + __builtin_vec_div); +} + +static inline void +vpair_f64_max (vector_pair_f64_t *__r, + const vector_pair_f64_t *__a, + const vector_pair_f64_t *__b) +{ + __VPAIR_F64_BINARY (__r, __a, __b, + "xvmaxdp", + __builtin_vec_max); +} + +static inline void +vpair_f64_min (vector_pair_f64_t *__r, + const vector_pair_f64_t *__a, + const vector_pair_f64_t *__b) +{ + __VPAIR_F64_BINARY (__r, __a, __b, + "xvmindp", + __builtin_vec_min); +} + +static inline void +vpair_f64_mul (vector_pair_f64_t *__r, + const vector_pair_f64_t *__a, + const vector_pair_f64_t *__b) +{ + __VPAIR_F64_BINARY (__r, __a, __b, + "xvmuldp", + __builtin_vec_mul); +} + +static inline void +vpair_f64_sub (vector_pair_f64_t *__r, + const vector_pair_f64_t *__a, + const vector_pair_f64_t *__b) +{ + __VPAIR_F64_BINARY (__r, __a, __b, + "xvsubdp", + __builtin_vec_sub); +} + +/* 32-bit binary functions. 
*/ + +static inline void +vpair_f32_add (vector_pair_f32_t *__r, + const vector_pair_f32_t *__a, + const vector_pair_f32_t *__b) +{ + __VPAIR_F32_BINARY (__r, __a, __b, + "xvaddsp", + __builtin_vec_add); +} + +static inline void +vpair_f32_div (vector_pair_f32_t *__r, + const vector_pair_f32_t *__a, + const vector_pair_f32_t *__b) +{ + __VPAIR_F32_BINARY (__r, __a, __b, + "xvdivsp", + __builtin_vec_div); +} + +static inline void +vpair_f32_max (vector_pair_f32_t *__r, + const vector_pair_f32_t *__a, + const vector_pair_f32_t *__b) +{ + __VPAIR_F32_BINARY (__r, __a, __b, + "xvmaxsp", + __builtin_vec_max); +} + +static inline void +vpair_f32_min (vector_pair_f32_t *__r, + const vector_pair_f32_t *__a, + const vector_pair_f32_t *__b) +{ + __VPAIR_F32_BINARY (__r, __a, __b, + "xvminsp", + __builtin_vec_min); +} + +static inline void +vpair_f32_mul (vector_pair_f32_t *__r, + const vector_pair_f32_t *__a, + const vector_pair_f32_t *__b) +{ + __VPAIR_F32_BINARY (__r, __a, __b, + "xvmulsp", + __builtin_vec_mul); +} + +static inline void +vpair_f32_sub (vector_pair_f32_t *__r, + const vector_pair_f32_t *__a, + const vector_pair_f32_t *__b) +{ + __VPAIR_F32_BINARY (__r, __a, __b, + "xvsubsp", + __builtin_vec_sub); +} + +/* 64-bit fma operations. 
*/ + +static inline void +vpair_f64_fma (vector_pair_f64_t *__r, + const vector_pair_f64_t *__a, + const vector_pair_f64_t *__b, + const vector_pair_f64_t *__c) +{ + __VPAIR_F64_FMA (__r, __a, __b, __c, + "xvmaddadp", + __builtin_vsx_xvmadddp); +} + +static inline void +vpair_f64_fms (vector_pair_f64_t *__r, + const vector_pair_f64_t *__a, + const vector_pair_f64_t *__b, + const vector_pair_f64_t *__c) +{ + __VPAIR_F64_FMA (__r, __a, __b, __c, + "xvmsubadp", + __builtin_vsx_xvmsubdp); +} + +static inline void +vpair_f64_nfma (vector_pair_f64_t *__r, + const vector_pair_f64_t *__a, + const vector_pair_f64_t *__b, + const vector_pair_f64_t *__c) +{ + __VPAIR_F64_FMA (__r, __a, __b, __c, + "xvnmaddadp", + __builtin_vsx_xvnmadddp); +} + +static inline void +vpair_f64_nfms (vector_pair_f64_t *__r, + const vector_pair_f64_t *__a, + const vector_pair_f64_t *__b, + const vector_pair_f64_t *__c) +{ + __VPAIR_F64_FMA (__r, __a, __b, __c, + "xvnmsubadp", + __builtin_vsx_xvnmsubdp); +} +/* 32-bit fma operations. 
*/ + +static inline void +vpair_f32_fma (vector_pair_f32_t *__r, + const vector_pair_f32_t *__a, + const vector_pair_f32_t *__b, + const vector_pair_f32_t *__c) +{ + __VPAIR_F32_FMA (__r, __a, __b, __c, + "xvmaddasp", + __builtin_vsx_xvmaddsp); +} + +static inline void +vpair_f32_fms (vector_pair_f32_t *__r, + const vector_pair_f32_t *__a, + const vector_pair_f32_t *__b, + const vector_pair_f32_t *__c) +{ + __VPAIR_F32_FMA (__r, __a, __b, __c, + "xvmsubasp", + __builtin_vsx_xvmsubsp); +} + +static inline void +vpair_f32_nfma (vector_pair_f32_t *__r, + const vector_pair_f32_t *__a, + const vector_pair_f32_t *__b, + const vector_pair_f32_t *__c) +{ + __VPAIR_F32_FMA (__r, __a, __b, __c, + "xvnmaddasp", + __builtin_vsx_xvnmaddsp); +} + +static inline void +vpair_f32_nfms (vector_pair_f32_t *__r, + const vector_pair_f32_t *__a, + const vector_pair_f32_t *__b, + const vector_pair_f32_t *__c) +{ + __VPAIR_F32_FMA (__r, __a, __b, __c, + "xvnmsubasp", + __builtin_vsx_xvnmsubsp); +} +#endif /* _VECTOR_PAIR_H. */ diff --git a/gcc/doc/extend.texi b/gcc/doc/extend.texi index f46c3df3303..4c9e8c2e313 100644 --- a/gcc/doc/extend.texi +++ b/gcc/doc/extend.texi @@ -16157,6 +16157,7 @@ instructions, but allow the compiler to schedule those calls. * PowerPC Hardware Transactional Memory Built-in Functions:: * PowerPC Atomic Memory Operation Functions:: * PowerPC Matrix-Multiply Assist Built-in Functions:: +* PowerPC Vector Pair Support:: * PRU Built-in Functions:: * RISC-V Built-in Functions:: * RISC-V Vector Intrinsics:: @@ -24673,6 +24674,103 @@ __vector_pair __builtin_vsx_lxvp (size_t, __vector_pair *); void __builtin_vsx_stxvp (__vector_pair, size_t, __vector_pair *); @end smallexample +@node PowerPC Vector Pair Support +@subsection PowerPC Vector Pair Support +ISA 3.1 (power10) added instructions to load and store pairs of +vectors with a single instruction. 
+ +GCC now provides an include file (@file{vector-pair.h}) on PowerPC +systems that allows users to write 32-bit and +64-bit floating point code that processes data in 256-bit chunks +rather than 128-bit chunks. + +If the code is compiled on an ISA 3.1 system with MMA enabled, the +vector pair functions will use the @code{__vector_pair} type to have +values in adjacent vectors and do the operation as a pair of +operations. + +If the code is compiled on a VSX system, but not one with MMA enabled, the vector +pair functions will use 2 separate vectors to do the operation. + +Two types are provided: @code{vector_pair_f64_t} is for vector pairs +that will operate on units of 4 64-bit floating point values, and +@code{vector_pair_f32_t} is for operating on units of 8 32-bit floating +point values. + +@node PowerPC Vector Pair Support for 64-bit floating point +@subsection PowerPC Vector Pair Support for 64-bit floating point. + +The following functions are provided for operating on vector pairs +that consist of 4 64-bit floating point values: + +@smallexample +void vpair_f64_splat (vector_pair_f64_t *, double); + +void vpair_f64_abs (vector_pair_f64_t *, vector_pair_f64_t *); +void vpair_f64_nabs (vector_pair_f64_t *, vector_pair_f64_t *); +void vpair_f64_neg (vector_pair_f64_t *, vector_pair_f64_t *); +void vpair_f64_sqrt (vector_pair_f64_t *, vector_pair_f64_t *); + +void vpair_f64_add (vector_pair_f64_t *, vector_pair_f64_t *, + vector_pair_f64_t *); +void vpair_f64_div (vector_pair_f64_t *, vector_pair_f64_t *, + vector_pair_f64_t *); +void vpair_f64_max (vector_pair_f64_t *, vector_pair_f64_t *, + vector_pair_f64_t *); +void vpair_f64_min (vector_pair_f64_t *, vector_pair_f64_t *, + vector_pair_f64_t *); +void vpair_f64_mul (vector_pair_f64_t *, vector_pair_f64_t *, + vector_pair_f64_t *); +void vpair_f64_sub (vector_pair_f64_t *, vector_pair_f64_t *, + vector_pair_f64_t *); + +void vpair_f64_fma (vector_pair_f64_t *, vector_pair_f64_t *, + 
vector_pair_f64_t *, vector_pair_f64_t *); +void vpair_f64_fms (vector_pair_f64_t *, vector_pair_f64_t *, + vector_pair_f64_t *, vector_pair_f64_t *); +void vpair_f64_nfma (vector_pair_f64_t *, vector_pair_f64_t *, + vector_pair_f64_t *, vector_pair_f64_t *); +void vpair_f64_nfms (vector_pair_f64_t *, vector_pair_f64_t *, + vector_pair_f64_t *, vector_pair_f64_t *); +@end smallexample + +@node PowerPC Vector Pair Support for 32-bit floating point +@subsection PowerPC Vector Pair Support for 32-bit floating point. + +The following functions are provided for operating on vector pairs +that consist of 8 32-bit floating point values: + +@smallexample +void vpair_f32_splat (vector_pair_f32_t *, float); + +void vpair_f32_abs (vector_pair_f32_t *, vector_pair_f32_t *); +void vpair_f32_nabs (vector_pair_f32_t *, vector_pair_f32_t *); +void vpair_f32_neg (vector_pair_f32_t *, vector_pair_f32_t *); +void vpair_f32_sqrt (vector_pair_f32_t *, vector_pair_f32_t *); + +void vpair_f32_add (vector_pair_f32_t *, vector_pair_f32_t *, + vector_pair_f32_t *); +void vpair_f32_div (vector_pair_f32_t *, vector_pair_f32_t *, + vector_pair_f32_t *); +void vpair_f32_max (vector_pair_f32_t *, vector_pair_f32_t *, + vector_pair_f32_t *); +void vpair_f32_min (vector_pair_f32_t *, vector_pair_f32_t *, + vector_pair_f32_t *); +void vpair_f32_mul (vector_pair_f32_t *, vector_pair_f32_t *, + vector_pair_f32_t *); +void vpair_f32_sub (vector_pair_f32_t *, vector_pair_f32_t *, + vector_pair_f32_t *); + +void vpair_f32_fma (vector_pair_f32_t *, vector_pair_f32_t *, + vector_pair_f32_t *, vector_pair_f32_t *); +void vpair_f32_fms (vector_pair_f32_t *, vector_pair_f32_t *, + vector_pair_f32_t *, vector_pair_f32_t *); +void vpair_f32_nfma (vector_pair_f32_t *, vector_pair_f32_t *, + vector_pair_f32_t *, vector_pair_f32_t *); +void vpair_f32_nfms (vector_pair_f32_t *, vector_pair_f32_t *, + vector_pair_f32_t *, vector_pair_f32_t *); +@end smallexample + @node PRU Built-in Functions @subsection PRU 
Built-in Functions diff --git a/gcc/testsuite/gcc.target/powerpc/vpair-1.c b/gcc/testsuite/gcc.target/powerpc/vpair-1.c new file mode 100644 index 00000000000..55772cc44e3 --- /dev/null +++ b/gcc/testsuite/gcc.target/powerpc/vpair-1.c @@ -0,0 +1,141 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target power10_ok } */ +/* { dg-options "-mdejagnu-cpu=power10 -O2" } */ + +/* Test whether the vector builtin code generates the expected instructions for + vector pairs with 4 double elements. */ + +#include <vector-pair.h> + +void +test_add (vector_pair_f64_t *dest, + vector_pair_f64_t *x, + vector_pair_f64_t *y) +{ + /* 2 lxvp, 2 xvadddp, 1 stxvp. */ + vpair_f64_add (dest, x, y); +} + +void +test_sub (vector_pair_f64_t *dest, + vector_pair_f64_t *x, + vector_pair_f64_t *y) +{ + /* 2 lxvp, 2 xvsubdp, 1 stxvp. */ + vpair_f64_sub (dest, x, y); +} + +void +test_multiply (vector_pair_f64_t *dest, + vector_pair_f64_t *x, + vector_pair_f64_t *y) +{ + /* 2 lxvp, 2 xvmuldp, 1 stxvp. */ + vpair_f64_mul (dest, x, y); +} + +void +test_min (vector_pair_f64_t *dest, + vector_pair_f64_t *x, + vector_pair_f64_t *y) +{ + /* 2 lxvp, 2 xvmindp, 1 stxvp. */ + vpair_f64_min (dest, x, y); +} + +void +test_max (vector_pair_f64_t *dest, + vector_pair_f64_t *x, + vector_pair_f64_t *y) +{ + /* 2 lxvp, 2 xvmaxdp, 1 stxvp. */ + vpair_f64_max (dest, x, y); +} + +void +test_negate (vector_pair_f64_t *dest, + vector_pair_f64_t *x) +{ + /* 1 lxvp, 2 xvnegdp, 1 stxvp. */ + vpair_f64_neg (dest, x); +} + +void +test_abs (vector_pair_f64_t *dest, + vector_pair_f64_t *x) +{ + /* 1 lxvp, 2 xvabsdp, 1 stxvp. */ + vpair_f64_abs (dest, x); +} + +void +test_negative_abs (vector_pair_f64_t *dest, + vector_pair_f64_t *x) +{ + /* 1 lxvp, 2 xvnabsdp, 1 stxvp. */ + vpair_f64_nabs (dest, x); +} + +void +test_sqrt (vector_pair_f64_t *dest, + vector_pair_f64_t *x) +{ + /* 1 lxvp, 2 xvsqrtdp, 1 stxvp. 
*/ + vpair_f64_sqrt (dest, x); +} + +void +test_fma (vector_pair_f64_t *dest, + vector_pair_f64_t *x, + vector_pair_f64_t *y, + vector_pair_f64_t *z) +{ + /* 2 lxvp, 2 xvmadd{a,m}dp, 1 stxvp. */ + vpair_f64_fma (dest, x, y, z); +} + +void +test_fms (vector_pair_f64_t *dest, + vector_pair_f64_t *x, + vector_pair_f64_t *y, + vector_pair_f64_t *z) +{ + /* 2 lxvp, 2 xvmsub{a,m}dp, 1 stxvp. */ + vpair_f64_fms (dest, x, y, z); +} + +void +test_nfma (vector_pair_f64_t *dest, + vector_pair_f64_t *x, + vector_pair_f64_t *y, + vector_pair_f64_t *z) +{ + /* 2 lxvp, 2 xvnmadd{a,m}dp, 1 stxvp. */ + vpair_f64_nfma (dest, x, y, z); +} + +void +test_nfms (vector_pair_f64_t *dest, + vector_pair_f64_t *x, + vector_pair_f64_t *y, + vector_pair_f64_t *z) +{ + /* 2 lxvp, 2 xvnmsub{a,m}dp, 1 stxvp. */ + vpair_f64_nfms (dest, x, y, z); +} + +/* { dg-final { scan-assembler-times {\mlxvp\M} 26 } } */ +/* { dg-final { scan-assembler-times {\mstxvp\M} 13 } } */ +/* { dg-final { scan-assembler-times {\mxvabsdp\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mxvadddp\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mxvmadd.dp\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mxvmaxdp\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mxvmindp\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mxvmsub.dp\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mxvmuldp\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mxvnabsdp\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mxvnegdp\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mxvnmadd.dp\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mxvnmsub.dp\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mxvsqrtdp\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mxvsubdp\M} 2 } } */ diff --git a/gcc/testsuite/gcc.target/powerpc/vpair-2.c b/gcc/testsuite/gcc.target/powerpc/vpair-2.c new file mode 100644 index 00000000000..3030b0b3338 --- /dev/null +++ b/gcc/testsuite/gcc.target/powerpc/vpair-2.c @@ -0,0 +1,141 @@ +/* { dg-do 
compile } */ +/* { dg-require-effective-target power10_ok } */ +/* { dg-options "-mdejagnu-cpu=power10 -O2" } */ + +/* Test whether the vector builtin code generates the expected instructions for + vector pairs with 8 float elements. */ + +#include <vector-pair.h> + +void +test_add (vector_pair_f32_t *dest, + vector_pair_f32_t *x, + vector_pair_f32_t *y) +{ + /* 2 lxvp, 2 xvaddsp, 1 stxvp. */ + vpair_f32_add (dest, x, y); +} + +void +test_sub (vector_pair_f32_t *dest, + vector_pair_f32_t *x, + vector_pair_f32_t *y) +{ + /* 2 lxvp, 2 xvsubsp, 1 stxvp. */ + vpair_f32_sub (dest, x, y); +} + +void +test_multiply (vector_pair_f32_t *dest, + vector_pair_f32_t *x, + vector_pair_f32_t *y) +{ + /* 2 lxvp, 2 xvmulsp, 1 stxvp. */ + vpair_f32_mul (dest, x, y); +} + +void +test_min (vector_pair_f32_t *dest, + vector_pair_f32_t *x, + vector_pair_f32_t *y) +{ + /* 2 lxvp, 2 xvminsp, 1 stxvp. */ + vpair_f32_min (dest, x, y); +} + +void +test_max (vector_pair_f32_t *dest, + vector_pair_f32_t *x, + vector_pair_f32_t *y) +{ + /* 2 lxvp, 2 xvmaxsp, 1 stxvp. */ + vpair_f32_max (dest, x, y); +} + +void +test_negate (vector_pair_f32_t *dest, + vector_pair_f32_t *x) +{ + /* 1 lxvp, 2 xvnegsp, 1 stxvp. */ + vpair_f32_neg (dest, x); +} + +void +test_abs (vector_pair_f32_t *dest, + vector_pair_f32_t *x) +{ + /* 1 lxvp, 2 xvabssp, 1 stxvp. */ + vpair_f32_abs (dest, x); +} + +void +test_negative_abs (vector_pair_f32_t *dest, + vector_pair_f32_t *x) +{ + /* 1 lxvp, 2 xvnabssp, 1 stxvp. */ + vpair_f32_nabs (dest, x); +} + +void +test_sqrt (vector_pair_f32_t *dest, + vector_pair_f32_t *x) +{ + /* 1 lxvp, 2 xvsqrtsp, 1 stxvp. */ + vpair_f32_sqrt (dest, x); +} + +void +test_fma (vector_pair_f32_t *dest, + vector_pair_f32_t *x, + vector_pair_f32_t *y, + vector_pair_f32_t *z) +{ + /* 2 lxvp, 2 xvmadd{a,m}sp, 1 stxvp. 
*/ + vpair_f32_fma (dest, x, y, z); +} + +void +test_fms (vector_pair_f32_t *dest, + vector_pair_f32_t *x, + vector_pair_f32_t *y, + vector_pair_f32_t *z) +{ + /* 2 lxvp, 2 xvmsub{a,m}sp, 1 stxvp. */ + vpair_f32_fms (dest, x, y, z); +} + +void +test_nfma (vector_pair_f32_t *dest, + vector_pair_f32_t *x, + vector_pair_f32_t *y, + vector_pair_f32_t *z) +{ + /* 2 lxvp, 2 xvnmadd{a,m}sp, 1 stxvp. */ + vpair_f32_nfma (dest, x, y, z); +} + +void +test_nfms (vector_pair_f32_t *dest, + vector_pair_f32_t *x, + vector_pair_f32_t *y, + vector_pair_f32_t *z) +{ + /* 2 lxvp, 2 xvnmsub{a,m}sp, 1 stxvp. */ + vpair_f32_nfms (dest, x, y, z); +} + +/* { dg-final { scan-assembler-times {\mlxvp\M} 26 } } */ +/* { dg-final { scan-assembler-times {\mstxvp\M} 13 } } */ +/* { dg-final { scan-assembler-times {\mxvabssp\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mxvaddsp\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mxvmadd.sp\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mxvmaxsp\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mxvminsp\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mxvmsub.sp\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mxvmulsp\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mxvnabssp\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mxvnegsp\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mxvnmadd.sp\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mxvnmsub.sp\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mxvsqrtsp\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mxvsubsp\M} 2 } } */ diff --git a/gcc/testsuite/gcc.target/powerpc/vpair-3-not-p10.c b/gcc/testsuite/gcc.target/powerpc/vpair-3-not-p10.c new file mode 100644 index 00000000000..d1a1029417f --- /dev/null +++ b/gcc/testsuite/gcc.target/powerpc/vpair-3-not-p10.c @@ -0,0 +1,15 @@ +/* { dg-do run { target { vsx_hw } } } */ +/* { dg-options "-mvsx -O2 -ffast-math -mno-mma" } */ + +/* + * This test of the double (f64) vector pair functions in vector-pair.h is 
run + * on VSX systems when the load/store vector pair instructions are not + * available. + * + * The -ffast-math option is used to just use the hardware sqrt, min, and max + * instructions without calling into the library. + * + * The -mno-mma option disables GCC from enabling the __vector_pair type. + */ + +#include "vpair-3.h" diff --git a/gcc/testsuite/gcc.target/powerpc/vpair-3-p10.c b/gcc/testsuite/gcc.target/powerpc/vpair-3-p10.c new file mode 100644 index 00000000000..d78faf3fed4 --- /dev/null +++ b/gcc/testsuite/gcc.target/powerpc/vpair-3-p10.c @@ -0,0 +1,14 @@ +/* { dg-do run { target { power10_hw } } } */ +/* { dg-options "-mdejagnu-cpu=power10 -O2 -ffast-math -mmma" } */ + +/* + * This test of the double (f64) vector pair functions in vector-pair.h is run + * on VSX systems when the load/store vector pair instructions are available. + * + * The -ffast-math option is used to just use the hardware sqrt, min, and max + * instructions without calling into the library. + * + * The -mmma option makes sure GCC enables the __vector_pair type. + */ + +#include "vpair-3.h" diff --git a/gcc/testsuite/gcc.target/powerpc/vpair-3.h b/gcc/testsuite/gcc.target/powerpc/vpair-3.h new file mode 100644 index 00000000000..e61ad23dd57 --- /dev/null +++ b/gcc/testsuite/gcc.target/powerpc/vpair-3.h @@ -0,0 +1,435 @@ +/* Common include file to test the vector pair double functions. This is run + two times, once compiled for a non-power10 system that does not have the + vector pair load and store instructions, and once with power10 defaults that + has load/store vector pair. */ + +#include <stddef.h> +#include <stdlib.h> +#include <vector-pair.h> + +#ifdef DEBUG +#include <stdio.h> +#endif + +#ifndef NUM +#define NUM 16 +#endif + +static double result1[NUM]; +static double result2[NUM]; +static double in_a[NUM]; +static double in_b[NUM]; +static double in_c[NUM]; + +/* vector pair tests. 
*/ + +void +vpair_abs (double *r, double *a, double *b, double *c, size_t num) +{ + vector_pair_f64_t *vr = (vector_pair_f64_t *)r; + vector_pair_f64_t *va = (vector_pair_f64_t *)a; + + size_t i; + size_t num2 = num / (sizeof (vector_pair_f64_t) / sizeof (double)); + + for (i = 0; i < num2; i++) + vpair_f64_abs (vr + i, va + i); +} + +void +vpair_nabs (double *r, double *a, double *b, double *c, size_t num) +{ + vector_pair_f64_t *vr = (vector_pair_f64_t *)r; + vector_pair_f64_t *va = (vector_pair_f64_t *)a; + + size_t i; + size_t num2 = num / (sizeof (vector_pair_f64_t) / sizeof (double)); + + for (i = 0; i < num2; i++) + vpair_f64_nabs (vr + i, va + i); +} + +void +vpair_neg (double *r, double *a, double *b, double *c, size_t num) +{ + vector_pair_f64_t *vr = (vector_pair_f64_t *)r; + vector_pair_f64_t *va = (vector_pair_f64_t *)a; + + size_t i; + size_t num2 = num / (sizeof (vector_pair_f64_t) / sizeof (double)); + + for (i = 0; i < num2; i++) + vpair_f64_neg (vr + i, va + i); +} + +void +vpair_sqrt (double *r, double *a, double *b, double *c, size_t num) +{ + vector_pair_f64_t *vr = (vector_pair_f64_t *)r; + vector_pair_f64_t *va = (vector_pair_f64_t *)a; + + size_t i; + size_t num2 = num / (sizeof (vector_pair_f64_t) / sizeof (double)); + + for (i = 0; i < num2; i++) + vpair_f64_sqrt (vr + i, va + i); +} + +void +vpair_add (double *r, double *a, double *b, double *c, size_t num) +{ + vector_pair_f64_t *vr = (vector_pair_f64_t *)r; + vector_pair_f64_t *va = (vector_pair_f64_t *)a; + vector_pair_f64_t *vb = (vector_pair_f64_t *)b; + + size_t i; + size_t num2 = num / (sizeof (vector_pair_f64_t) / sizeof (double)); + + for (i = 0; i < num2; i++) + vpair_f64_add (vr + i, va + i, vb + i); +} + +void +vpair_sub (double *r, double *a, double *b, double *c, size_t num) +{ + vector_pair_f64_t *vr = (vector_pair_f64_t *)r; + vector_pair_f64_t *va = (vector_pair_f64_t *)a; + vector_pair_f64_t *vb = (vector_pair_f64_t *)b; + + size_t i; + size_t num2 = num / (sizeof 
(vector_pair_f64_t) / sizeof (double)); + + for (i = 0; i < num2; i++) + vpair_f64_sub (vr + i, va + i, vb + i); +} + +void +vpair_mul (double *r, double *a, double *b, double *c, size_t num) +{ + vector_pair_f64_t *vr = (vector_pair_f64_t *)r; + vector_pair_f64_t *va = (vector_pair_f64_t *)a; + vector_pair_f64_t *vb = (vector_pair_f64_t *)b; + + size_t i; + size_t num2 = num / (sizeof (vector_pair_f64_t) / sizeof (double)); + + for (i = 0; i < num2; i++) + vpair_f64_mul (vr + i, va + i, vb + i); +} + +void +vpair_div (double *r, double *a, double *b, double *c, size_t num) +{ + vector_pair_f64_t *vr = (vector_pair_f64_t *)r; + vector_pair_f64_t *va = (vector_pair_f64_t *)a; + vector_pair_f64_t *vb = (vector_pair_f64_t *)b; + + size_t i; + size_t num2 = num / (sizeof (vector_pair_f64_t) / sizeof (double)); + + for (i = 0; i < num2; i++) + vpair_f64_div (vr + i, va + i, vb + i); +} + +void +vpair_min (double *r, double *a, double *b, double *c, size_t num) +{ + vector_pair_f64_t *vr = (vector_pair_f64_t *)r; + vector_pair_f64_t *va = (vector_pair_f64_t *)a; + vector_pair_f64_t *vb = (vector_pair_f64_t *)b; + + size_t i; + size_t num2 = num / (sizeof (vector_pair_f64_t) / sizeof (double)); + + for (i = 0; i < num2; i++) + vpair_f64_min (vr + i, va + i, vb + i); +} + +void +vpair_max (double *r, double *a, double *b, double *c, size_t num) +{ + vector_pair_f64_t *vr = (vector_pair_f64_t *)r; + vector_pair_f64_t *va = (vector_pair_f64_t *)a; + vector_pair_f64_t *vb = (vector_pair_f64_t *)b; + + size_t i; + size_t num2 = num / (sizeof (vector_pair_f64_t) / sizeof (double)); + + for (i = 0; i < num2; i++) + vpair_f64_max (vr + i, va + i, vb + i); +} + +void +vpair_fma (double *r, double *a, double *b, double *c, size_t num) +{ + vector_pair_f64_t *vr = (vector_pair_f64_t *)r; + vector_pair_f64_t *va = (vector_pair_f64_t *)a; + vector_pair_f64_t *vb = (vector_pair_f64_t *)b; + vector_pair_f64_t *vc = (vector_pair_f64_t *)c; + + size_t i; + size_t num2 = num / (sizeof 
(vector_pair_f64_t) / sizeof (double)); + + for (i = 0; i < num2; i++) + vpair_f64_fma (vr + i, va + i, vb + i, vc + i); +} + +void +vpair_fms (double *r, double *a, double *b, double *c, size_t num) +{ + vector_pair_f64_t *vr = (vector_pair_f64_t *)r; + vector_pair_f64_t *va = (vector_pair_f64_t *)a; + vector_pair_f64_t *vb = (vector_pair_f64_t *)b; + vector_pair_f64_t *vc = (vector_pair_f64_t *)c; + + size_t i; + size_t num2 = num / (sizeof (vector_pair_f64_t) / sizeof (double)); + + for (i = 0; i < num2; i++) + vpair_f64_fms (vr + i, va + i, vb + i, vc + i); +} + +void +vpair_nfma (double *r, double *a, double *b, double *c, size_t num) +{ + vector_pair_f64_t *vr = (vector_pair_f64_t *)r; + vector_pair_f64_t *va = (vector_pair_f64_t *)a; + vector_pair_f64_t *vb = (vector_pair_f64_t *)b; + vector_pair_f64_t *vc = (vector_pair_f64_t *)c; + + size_t i; + size_t num2 = num / (sizeof (vector_pair_f64_t) / sizeof (double)); + + for (i = 0; i < num2; i++) + vpair_f64_nfma (vr + i, va + i, vb + i, vc + i); +} + +void +vpair_nfms (double *r, double *a, double *b, double *c, size_t num) +{ + vector_pair_f64_t *vr = (vector_pair_f64_t *)r; + vector_pair_f64_t *va = (vector_pair_f64_t *)a; + vector_pair_f64_t *vb = (vector_pair_f64_t *)b; + vector_pair_f64_t *vc = (vector_pair_f64_t *)c; + + size_t i; + size_t num2 = num / (sizeof (vector_pair_f64_t) / sizeof (double)); + + for (i = 0; i < num2; i++) + vpair_f64_nfms (vr + i, va + i, vb + i, vc + i); +} + + +/* scalar tests. */ + +void +scalar_abs (double *r, double *a, double *b, double *c, size_t num) +{ + size_t i; + + for (i = 0; i < num; i++) + r[i] = (a[i] < 0.0) ? -a[i] : a[i]; +} + +void +scalar_nabs (double *r, double *a, double *b, double *c, size_t num) +{ + size_t i; + + for (i = 0; i < num; i++) + r[i] = (a[i] < 0.0) ? 
a[i] : -a[i]; +} + +void +scalar_neg (double *r, double *a, double *b, double *c, size_t num) +{ + size_t i; + + for (i = 0; i < num; i++) + r[i] = -a[i]; +} + +void +scalar_sqrt (double *r, double *a, double *b, double *c, size_t num) +{ + size_t i; + + for (i = 0; i < num; i++) + r[i] = __builtin_sqrt (a[i]); +} + +void +scalar_add (double *r, double *a, double *b, double *c, size_t num) +{ + size_t i; + + for (i = 0; i < num; i++) + r[i] = a[i] + b[i]; +} + +void +scalar_sub (double *r, double *a, double *b, double *c, size_t num) +{ + size_t i; + + for (i = 0; i < num; i++) + r[i] = a[i] - b[i]; +} + +void +scalar_mul (double *r, double *a, double *b, double *c, size_t num) +{ + size_t i; + + for (i = 0; i < num; i++) + r[i] = a[i] * b[i]; +} + +void +scalar_div (double *r, double *a, double *b, double *c, size_t num) +{ + size_t i; + + for (i = 0; i < num; i++) + r[i] = a[i] / b[i]; +} + +void +scalar_min (double *r, double *a, double *b, double *c, size_t num) +{ + size_t i; + + for (i = 0; i < num; i++) + r[i] = (a[i] < b[i]) ? a[i] : b[i]; +} + +void +scalar_max (double *r, double *a, double *b, double *c, size_t num) +{ + size_t i; + + for (i = 0; i < num; i++) + r[i] = (a[i] > b[i]) ? a[i] : b[i]; +} + +void +scalar_fma (double *r, double *a, double *b, double *c, size_t num) +{ + size_t i; + + for (i = 0; i < num; i++) + r[i] = __builtin_fma (a[i], b[i], c[i]); +} + +void +scalar_fms (double *r, double *a, double *b, double *c, size_t num) +{ + size_t i; + + for (i = 0; i < num; i++) + r[i] = __builtin_fma (a[i], b[i], -c[i]); +} + +void +scalar_nfma (double *r, double *a, double *b, double *c, size_t num) +{ + size_t i; + + for (i = 0; i < num; i++) + r[i] = - __builtin_fma (a[i], b[i], c[i]); +} + +void +scalar_nfms (double *r, double *a, double *b, double *c, size_t num) +{ + size_t i; + + for (i = 0; i < num; i++) + r[i] = - __builtin_fma (a[i], b[i], -c[i]); +} + + +/* Check results. 
*/ +void +check (const char *name) +{ + size_t i; + + for (i = 0; i < NUM; i++) + if (result1[i] != result2[i]) + { +#ifdef DEBUG + printf ("test #%ld failed, %g != %g, %s (%g, %g, %g).\n", + (long)i, + result1[i], + result2[i], + name, + in_a[i], + in_b[i], + in_c[i]); +#endif + abort (); + } + + return; +} + +typedef void func_t (double *, double *, double *, double *, size_t); + +/* tests to run. */ +struct +{ + func_t *vpair_test; + func_t *scalar_test; + const char *name; +} tests[] = { + { vpair_abs, scalar_abs, "abs" }, + { vpair_nabs, scalar_nabs, "nabs" }, + { vpair_neg, scalar_neg, "neg" }, + { vpair_sqrt, scalar_sqrt, "sqrt" }, + { vpair_add, scalar_add, "add" }, + { vpair_sub, scalar_sub, "sub" }, + { vpair_mul, scalar_mul, "mul" }, + { vpair_div, scalar_div, "div" }, + { vpair_min, scalar_min, "min" }, + { vpair_max, scalar_max, "max" }, + { vpair_fma, scalar_fma, "fma" }, + { vpair_fms, scalar_fms, "fms" }, + { vpair_nfma, scalar_nfma, "nfma" }, + { vpair_nfms, scalar_nfms, "nfms" }, +}; + +/* Run tests. */ + +int +main (void) +{ + size_t i; + + /* Initialize the inputs. */ + for (i = 0; i < NUM; i++) + { + double d = (double)(i + 1); + in_a[i] = d * d; + in_b[i] = d; + in_c[i] = d + 2.0; + } + +#ifdef DEBUG + printf ("Start tests\n"); +#endif + + /* Run the tests. 
*/ + for (i = 0; i < sizeof (tests) / sizeof (tests[0]); i++) + { + tests[i].vpair_test (result1, in_a, in_b, in_c, NUM); + tests[i].scalar_test (result2, in_a, in_b, in_c, NUM); + check (tests[i].name); + } + +#ifdef DEBUG + printf ("End tests\n"); +#endif + + return 0; +} diff --git a/gcc/testsuite/gcc.target/powerpc/vpair-4-not-p10.c b/gcc/testsuite/gcc.target/powerpc/vpair-4-not-p10.c new file mode 100644 index 00000000000..f57fbbf8b05 --- /dev/null +++ b/gcc/testsuite/gcc.target/powerpc/vpair-4-not-p10.c @@ -0,0 +1,15 @@ +/* { dg-do run { target { vsx_hw } } } */ +/* { dg-options "-mvsx -O2 -ffast-math -mno-mma" } */ + +/* + * This test of the float (f32) vector pair functions in vector-pair.h is run + * on VSX systems when the load/store vector pair instructions are not + * available. + * + * The -ffast-math option is used to just use the hardware sqrt, min, and max + * instructions without calling into the library. + * + * The -mno-mma option prevents GCC from enabling the __vector_pair type. + */ + +#include "vpair-4.h" diff --git a/gcc/testsuite/gcc.target/powerpc/vpair-4-p10.c b/gcc/testsuite/gcc.target/powerpc/vpair-4-p10.c new file mode 100644 index 00000000000..12291202c16 --- /dev/null +++ b/gcc/testsuite/gcc.target/powerpc/vpair-4-p10.c @@ -0,0 +1,14 @@ +/* { dg-do run { target { power10_hw } } } */ +/* { dg-options "-mdejagnu-cpu=power10 -O2 -ffast-math -mmma" } */ + +/* + * This test of the float (f32) vector pair functions in vector-pair.h is run + * on VSX systems when the load/store vector pair instructions are available. + * + * The -ffast-math option is used to just use the hardware sqrt, min, and max + * instructions without calling into the library. + * + * The -mmma option makes sure GCC enables the __vector_pair type. 
+ */ + +#include "vpair-4.h" diff --git a/gcc/testsuite/gcc.target/powerpc/vpair-4.h b/gcc/testsuite/gcc.target/powerpc/vpair-4.h new file mode 100644 index 00000000000..1a80cf5e639 --- /dev/null +++ b/gcc/testsuite/gcc.target/powerpc/vpair-4.h @@ -0,0 +1,435 @@ +/* Common include file to test the vector pair float functions. This is run + two times, once compiled for a non-power10 system that does not have the + vector pair load and store instructions, and once with power10 defaults that + has load/store vector pair. */ + +#include <stddef.h> +#include <stdlib.h> +#include <vector-pair.h> + +#ifdef DEBUG +#include <stdio.h> +#endif + +#ifndef NUM +#define NUM 16 +#endif + +static float result1[NUM]; +static float result2[NUM]; +static float in_a[NUM]; +static float in_b[NUM]; +static float in_c[NUM]; + +/* vector pair tests. */ + +void +vpair_abs (float *r, float *a, float *b, float *c, size_t num) +{ + vector_pair_f32_t *vr = (vector_pair_f32_t *)r; + vector_pair_f32_t *va = (vector_pair_f32_t *)a; + + size_t i; + size_t num2 = num / (sizeof (vector_pair_f32_t) / sizeof (float)); + + for (i = 0; i < num2; i++) + vpair_f32_abs (vr + i, va + i); +} + +void +vpair_nabs (float *r, float *a, float *b, float *c, size_t num) +{ + vector_pair_f32_t *vr = (vector_pair_f32_t *)r; + vector_pair_f32_t *va = (vector_pair_f32_t *)a; + + size_t i; + size_t num2 = num / (sizeof (vector_pair_f32_t) / sizeof (float)); + + for (i = 0; i < num2; i++) + vpair_f32_nabs (vr + i, va + i); +} + +void +vpair_neg (float *r, float *a, float *b, float *c, size_t num) +{ + vector_pair_f32_t *vr = (vector_pair_f32_t *)r; + vector_pair_f32_t *va = (vector_pair_f32_t *)a; + + size_t i; + size_t num2 = num / (sizeof (vector_pair_f32_t) / sizeof (float)); + + for (i = 0; i < num2; i++) + vpair_f32_neg (vr + i, va + i); +} + +void +vpair_sqrt (float *r, float *a, float *b, float *c, size_t num) +{ + vector_pair_f32_t *vr = (vector_pair_f32_t *)r; + vector_pair_f32_t *va = (vector_pair_f32_t *)a; + 
+ size_t i; + size_t num2 = num / (sizeof (vector_pair_f32_t) / sizeof (float)); + + for (i = 0; i < num2; i++) + vpair_f32_sqrt (vr + i, va + i); +} + +void +vpair_add (float *r, float *a, float *b, float *c, size_t num) +{ + vector_pair_f32_t *vr = (vector_pair_f32_t *)r; + vector_pair_f32_t *va = (vector_pair_f32_t *)a; + vector_pair_f32_t *vb = (vector_pair_f32_t *)b; + + size_t i; + size_t num2 = num / (sizeof (vector_pair_f32_t) / sizeof (float)); + + for (i = 0; i < num2; i++) + vpair_f32_add (vr + i, va + i, vb + i); +} + +void +vpair_sub (float *r, float *a, float *b, float *c, size_t num) +{ + vector_pair_f32_t *vr = (vector_pair_f32_t *)r; + vector_pair_f32_t *va = (vector_pair_f32_t *)a; + vector_pair_f32_t *vb = (vector_pair_f32_t *)b; + + size_t i; + size_t num2 = num / (sizeof (vector_pair_f32_t) / sizeof (float)); + + for (i = 0; i < num2; i++) + vpair_f32_sub (vr + i, va + i, vb + i); +} + +void +vpair_mul (float *r, float *a, float *b, float *c, size_t num) +{ + vector_pair_f32_t *vr = (vector_pair_f32_t *)r; + vector_pair_f32_t *va = (vector_pair_f32_t *)a; + vector_pair_f32_t *vb = (vector_pair_f32_t *)b; + + size_t i; + size_t num2 = num / (sizeof (vector_pair_f32_t) / sizeof (float)); + + for (i = 0; i < num2; i++) + vpair_f32_mul (vr + i, va + i, vb + i); +} + +void +vpair_div (float *r, float *a, float *b, float *c, size_t num) +{ + vector_pair_f32_t *vr = (vector_pair_f32_t *)r; + vector_pair_f32_t *va = (vector_pair_f32_t *)a; + vector_pair_f32_t *vb = (vector_pair_f32_t *)b; + + size_t i; + size_t num2 = num / (sizeof (vector_pair_f32_t) / sizeof (float)); + + for (i = 0; i < num2; i++) + vpair_f32_div (vr + i, va + i, vb + i); +} + +void +vpair_min (float *r, float *a, float *b, float *c, size_t num) +{ + vector_pair_f32_t *vr = (vector_pair_f32_t *)r; + vector_pair_f32_t *va = (vector_pair_f32_t *)a; + vector_pair_f32_t *vb = (vector_pair_f32_t *)b; + + size_t i; + size_t num2 = num / (sizeof (vector_pair_f32_t) / sizeof (float)); + + 
for (i = 0; i < num2; i++) + vpair_f32_min (vr + i, va + i, vb + i); +} + +void +vpair_max (float *r, float *a, float *b, float *c, size_t num) +{ + vector_pair_f32_t *vr = (vector_pair_f32_t *)r; + vector_pair_f32_t *va = (vector_pair_f32_t *)a; + vector_pair_f32_t *vb = (vector_pair_f32_t *)b; + + size_t i; + size_t num2 = num / (sizeof (vector_pair_f32_t) / sizeof (float)); + + for (i = 0; i < num2; i++) + vpair_f32_max (vr + i, va + i, vb + i); +} + +void +vpair_fma (float *r, float *a, float *b, float *c, size_t num) +{ + vector_pair_f32_t *vr = (vector_pair_f32_t *)r; + vector_pair_f32_t *va = (vector_pair_f32_t *)a; + vector_pair_f32_t *vb = (vector_pair_f32_t *)b; + vector_pair_f32_t *vc = (vector_pair_f32_t *)c; + + size_t i; + size_t num2 = num / (sizeof (vector_pair_f32_t) / sizeof (float)); + + for (i = 0; i < num2; i++) + vpair_f32_fma (vr + i, va + i, vb + i, vc + i); +} + +void +vpair_fms (float *r, float *a, float *b, float *c, size_t num) +{ + vector_pair_f32_t *vr = (vector_pair_f32_t *)r; + vector_pair_f32_t *va = (vector_pair_f32_t *)a; + vector_pair_f32_t *vb = (vector_pair_f32_t *)b; + vector_pair_f32_t *vc = (vector_pair_f32_t *)c; + + size_t i; + size_t num2 = num / (sizeof (vector_pair_f32_t) / sizeof (float)); + + for (i = 0; i < num2; i++) + vpair_f32_fms (vr + i, va + i, vb + i, vc + i); +} + +void +vpair_nfma (float *r, float *a, float *b, float *c, size_t num) +{ + vector_pair_f32_t *vr = (vector_pair_f32_t *)r; + vector_pair_f32_t *va = (vector_pair_f32_t *)a; + vector_pair_f32_t *vb = (vector_pair_f32_t *)b; + vector_pair_f32_t *vc = (vector_pair_f32_t *)c; + + size_t i; + size_t num2 = num / (sizeof (vector_pair_f32_t) / sizeof (float)); + + for (i = 0; i < num2; i++) + vpair_f32_nfma (vr + i, va + i, vb + i, vc + i); +} + +void +vpair_nfms (float *r, float *a, float *b, float *c, size_t num) +{ + vector_pair_f32_t *vr = (vector_pair_f32_t *)r; + vector_pair_f32_t *va = (vector_pair_f32_t *)a; + vector_pair_f32_t *vb = 
(vector_pair_f32_t *)b; + vector_pair_f32_t *vc = (vector_pair_f32_t *)c; + + size_t i; + size_t num2 = num / (sizeof (vector_pair_f32_t) / sizeof (float)); + + for (i = 0; i < num2; i++) + vpair_f32_nfms (vr + i, va + i, vb + i, vc + i); +} + + +/* scalar tests. */ + +void +scalar_abs (float *r, float *a, float *b, float *c, size_t num) +{ + size_t i; + + for (i = 0; i < num; i++) + r[i] = (a[i] < 0.0) ? -a[i] : a[i]; +} + +void +scalar_nabs (float *r, float *a, float *b, float *c, size_t num) +{ + size_t i; + + for (i = 0; i < num; i++) + r[i] = (a[i] < 0.0) ? a[i] : -a[i]; +} + +void +scalar_neg (float *r, float *a, float *b, float *c, size_t num) +{ + size_t i; + + for (i = 0; i < num; i++) + r[i] = -a[i]; +} + +void +scalar_sqrt (float *r, float *a, float *b, float *c, size_t num) +{ + size_t i; + + for (i = 0; i < num; i++) + r[i] = __builtin_sqrt (a[i]); +} + +void +scalar_add (float *r, float *a, float *b, float *c, size_t num) +{ + size_t i; + + for (i = 0; i < num; i++) + r[i] = a[i] + b[i]; +} + +void +scalar_sub (float *r, float *a, float *b, float *c, size_t num) +{ + size_t i; + + for (i = 0; i < num; i++) + r[i] = a[i] - b[i]; +} + +void +scalar_mul (float *r, float *a, float *b, float *c, size_t num) +{ + size_t i; + + for (i = 0; i < num; i++) + r[i] = a[i] * b[i]; +} + +void +scalar_div (float *r, float *a, float *b, float *c, size_t num) +{ + size_t i; + + for (i = 0; i < num; i++) + r[i] = a[i] / b[i]; +} + +void +scalar_min (float *r, float *a, float *b, float *c, size_t num) +{ + size_t i; + + for (i = 0; i < num; i++) + r[i] = (a[i] < b[i]) ? a[i] : b[i]; +} + +void +scalar_max (float *r, float *a, float *b, float *c, size_t num) +{ + size_t i; + + for (i = 0; i < num; i++) + r[i] = (a[i] > b[i]) ? 
a[i] : b[i]; +} + +void +scalar_fma (float *r, float *a, float *b, float *c, size_t num) +{ + size_t i; + + for (i = 0; i < num; i++) + r[i] = __builtin_fma (a[i], b[i], c[i]); +} + +void +scalar_fms (float *r, float *a, float *b, float *c, size_t num) +{ + size_t i; + + for (i = 0; i < num; i++) + r[i] = __builtin_fma (a[i], b[i], -c[i]); +} + +void +scalar_nfma (float *r, float *a, float *b, float *c, size_t num) +{ + size_t i; + + for (i = 0; i < num; i++) + r[i] = - __builtin_fma (a[i], b[i], c[i]); +} + +void +scalar_nfms (float *r, float *a, float *b, float *c, size_t num) +{ + size_t i; + + for (i = 0; i < num; i++) + r[i] = - __builtin_fma (a[i], b[i], -c[i]); +} + + +/* Check results. */ +void +check (const char *name) +{ + size_t i; + + for (i = 0; i < NUM; i++) + if (result1[i] != result2[i]) + { +#ifdef DEBUG + printf ("test #%ld failed, %g != %g, %s (%g, %g, %g).\n", + (long)i, + result1[i], + result2[i], + name, + in_a[i], + in_b[i], + in_c[i]); +#endif + abort (); + } + + return; +} + +typedef void func_t (float *, float *, float *, float *, size_t); + +/* tests to run. */ +struct +{ + func_t *vpair_test; + func_t *scalar_test; + const char *name; +} tests[] = { + { vpair_abs, scalar_abs, "abs" }, + { vpair_nabs, scalar_nabs, "nabs" }, + { vpair_neg, scalar_neg, "neg" }, + { vpair_sqrt, scalar_sqrt, "sqrt" }, + { vpair_add, scalar_add, "add" }, + { vpair_sub, scalar_sub, "sub" }, + { vpair_mul, scalar_mul, "mul" }, + { vpair_div, scalar_div, "div" }, + { vpair_min, scalar_min, "min" }, + { vpair_max, scalar_max, "max" }, + { vpair_fma, scalar_fma, "fma" }, + { vpair_fms, scalar_fms, "fms" }, + { vpair_nfma, scalar_nfma, "nfma" }, + { vpair_nfms, scalar_nfms, "nfms" }, +}; + +/* Run tests. */ + +int +main (void) +{ + size_t i; + + /* Initialize the inputs. */ + for (i = 0; i < NUM; i++) + { + float f = (float)(i + 1); + in_a[i] = f * f; + in_b[i] = f; + in_c[i] = f + 2.0f; + } + +#ifdef DEBUG + printf ("Start tests\n"); +#endif + + /* Run the tests. 
*/ + for (i = 0; i < sizeof (tests) / sizeof (tests[0]); i++) + { + tests[i].vpair_test (result1, in_a, in_b, in_c, NUM); + tests[i].scalar_test (result2, in_a, in_b, in_c, NUM); + check (tests[i].name); + } + +#ifdef DEBUG + printf ("End tests\n"); +#endif + + return 0; +} -- 2.46.2 -- Michael Meissner, IBM PO Box 98, Ayer, Massachusetts, USA, 01432 email: meiss...@linux.ibm.com