Hi, Fix codegen implementation for the builtin vec_pack_to_short_fp32.
Regtests are underway against powerpc64 (power7be,power8le,power9le). (this is a power9 builtin, so should be a noop for most of those). OK for trunk and backports? Thanks -Will [gcc] target pr/94954 * config/rs6000/altivec.h (vec_pack_to_short_fp32): Update. * config/rs6000/altivec.md (UNSPEC_CONVERT_4F32_8F16): New unspec. (convert_4f32_8f16): New define_expand. * config/rs6000/rs6000-builtin.def (convert_4f32_8f16): New builtin define and overload. * config/rs6000/rs6000-call.c (P9V_BUILTIN_VEC_CONVERT_4F32_8F16): New overloaded builtin entry. * config/rs6000/vsx.md (UNSPEC_VSX_XVCVSPHP): New unspec. (vsx_xvcvsphp): New define_insn. [testsuite] * testsuite/gcc.target/powerpc/builtins-1-p9-runnable.c: Update. diff --git a/gcc/config/rs6000/altivec.h b/gcc/config/rs6000/altivec.h index 0a7e8ab..ab10025 100644 --- a/gcc/config/rs6000/altivec.h +++ b/gcc/config/rs6000/altivec.h @@ -431,11 +431,11 @@ /* Vector additions added in ISA 3.0. */ #define vec_first_match_index __builtin_vec_first_match_index #define vec_first_match_or_eos_index __builtin_vec_first_match_or_eos_index #define vec_first_mismatch_index __builtin_vec_first_mismatch_index #define vec_first_mismatch_or_eos_index __builtin_vec_first_mismatch_or_eos_index -#define vec_pack_to_short_fp32 __builtin_vec_convert_4f32_8i16 +#define vec_pack_to_short_fp32 __builtin_vec_convert_4f32_8f16 #define vec_parity_lsbb __builtin_vec_vparity_lsbb #define vec_vctz __builtin_vec_vctz #define vec_cnttz __builtin_vec_vctz #define vec_vctzb __builtin_vec_vctzb #define vec_vctzd __builtin_vec_vctzd diff --git a/gcc/config/rs6000/altivec.md b/gcc/config/rs6000/altivec.md index 159f24e..855e7cc 100644 --- a/gcc/config/rs6000/altivec.md +++ b/gcc/config/rs6000/altivec.md @@ -78,10 +78,11 @@ UNSPEC_VUNPACK_HI_SIGN_DIRECT UNSPEC_VUNPACK_LO_SIGN_DIRECT UNSPEC_VUPKHPX UNSPEC_VUPKLPX UNSPEC_CONVERT_4F32_8I16 + UNSPEC_CONVERT_4F32_8F16 UNSPEC_DST UNSPEC_DSTT UNSPEC_DSTST UNSPEC_DSTSTT UNSPEC_LVSL @@ -3215,10 +3216,32 @@ emit_insn (gen_altivec_vctuxs (rtx_tmp_lo, operands[2], const0_rtx)); emit_insn (gen_altivec_vpkswss (operands[0], rtx_tmp_hi, rtx_tmp_lo)); DONE; }) + +;; Convert two vector F32 to packed vector f16. +(define_expand "convert_4f32_8f16" + [(set (match_operand:V8HI 0 "register_operand" "=v") + (unspec:V8HI [(match_operand:V4SF 1 "register_operand" "v") + (match_operand:V4SF 2 "register_operand" "v")] + UNSPEC_CONVERT_4F32_8F16))] + "TARGET_P9_VECTOR" +{ + rtx rtx_tmp_hi = gen_reg_rtx (V4SImode); + rtx rtx_tmp_lo = gen_reg_rtx (V4SImode); + + emit_insn (gen_vsx_xvcvsphp (rtx_tmp_hi, operands[1] )); + emit_insn (gen_vsx_xvcvsphp (rtx_tmp_lo, operands[2] )); + if (BYTES_BIG_ENDIAN) + emit_insn (gen_altivec_vpkuwum (operands[0], rtx_tmp_lo, rtx_tmp_hi)); + else + emit_insn (gen_altivec_vpkuwum (operands[0], rtx_tmp_hi, rtx_tmp_lo)); + DONE; +}) + + ;; Generate ;; xxlxor/vxor SCRATCH0,SCRATCH0,SCRATCH0 ;; vsubu?m SCRATCH2,SCRATCH1,%1 ;; vmaxs? %0,%1,SCRATCH2" (define_expand "abs<mode>2" diff --git a/gcc/config/rs6000/rs6000-builtin.def b/gcc/config/rs6000/rs6000-builtin.def index 8b1ddb0..47e9137 100644 --- a/gcc/config/rs6000/rs6000-builtin.def +++ b/gcc/config/rs6000/rs6000-builtin.def @@ -2208,10 +2208,11 @@ BU_P8V_OVERLOAD_3 (VPERMXOR, "vpermxor") /* ISA 3.0 vector overloaded 2-argument functions. */ BU_P9V_AV_2 (VSLV, "vslv", CONST, vslv) BU_P9V_AV_2 (VSRV, "vsrv", CONST, vsrv) BU_P9V_AV_2 (CONVERT_4F32_8I16, "convert_4f32_8i16", CONST, convert_4f32_8i16) +BU_P9V_AV_2 (CONVERT_4F32_8F16, "convert_4f32_8f16", CONST, convert_4f32_8f16) BU_P9V_AV_2 (VFIRSTMATCHINDEX_V16QI, "first_match_index_v16qi", CONST, first_match_index_v16qi) BU_P9V_AV_2 (VFIRSTMATCHINDEX_V8HI, "first_match_index_v8hi", CONST, first_match_index_v8hi) @@ -2238,10 +2239,11 @@ BU_P9V_AV_2 (VFIRSTMISMATCHOREOSINDEX_V4SI, "first_mismatch_or_eos_index_v4si", /* ISA 3.0 vector overloaded 2-argument functions. */ BU_P9V_OVERLOAD_2 (VSLV, "vslv") BU_P9V_OVERLOAD_2 (VSRV, "vsrv") BU_P9V_OVERLOAD_2 (CONVERT_4F32_8I16, "convert_4f32_8i16") +BU_P9V_OVERLOAD_2 (CONVERT_4F32_8F16, "convert_4f32_8f16") /* 2 argument vector functions added in ISA 3.0 (power9). */ BU_P9V_AV_2 (VADUB, "vadub", CONST, vaduv16qi3) BU_P9V_AV_2 (VADUH, "vaduh", CONST, vaduv8hi3) BU_P9V_AV_2 (VADUW, "vaduw", CONST, vaduv4si3) diff --git a/gcc/config/rs6000/rs6000-call.c b/gcc/config/rs6000/rs6000-call.c index 0ac8054..9708d7e 100644 --- a/gcc/config/rs6000/rs6000-call.c +++ b/gcc/config/rs6000/rs6000-call.c @@ -1975,10 +1975,12 @@ const struct altivec_builtin_types altivec_overloaded_builtins[] = { { P8V_BUILTIN_VEC_NEG, P8V_BUILTIN_NEG_V2DF, RS6000_BTI_V2DF, RS6000_BTI_V2DF, 0, 0 }, { P9V_BUILTIN_VEC_CONVERT_4F32_8I16, P9V_BUILTIN_CONVERT_4F32_8I16, RS6000_BTI_unsigned_V8HI, RS6000_BTI_V4SF, RS6000_BTI_V4SF, 0 }, + { P9V_BUILTIN_VEC_CONVERT_4F32_8F16, P9V_BUILTIN_CONVERT_4F32_8F16, + RS6000_BTI_unsigned_V8HI, RS6000_BTI_V4SF, RS6000_BTI_V4SF, 0 }, { P9V_BUILTIN_VEC_VFIRSTMATCHINDEX, P9V_BUILTIN_VFIRSTMATCHINDEX_V16QI, RS6000_BTI_UINTSI, RS6000_BTI_V16QI, RS6000_BTI_V16QI, 0 }, { P9V_BUILTIN_VEC_VFIRSTMATCHINDEX, P9V_BUILTIN_VFIRSTMATCHINDEX_V16QI, RS6000_BTI_UINTSI, RS6000_BTI_unsigned_V16QI, RS6000_BTI_unsigned_V16QI, 0 }, diff --git a/gcc/config/rs6000/vsx.md b/gcc/config/rs6000/vsx.md index 2a28215..38908a5 100644 --- a/gcc/config/rs6000/vsx.md +++ b/gcc/config/rs6000/vsx.md @@ -295,10 +295,11 @@ UNSPEC_VSX_DIVSD UNSPEC_VSX_DIVUD UNSPEC_VSX_MULSD UNSPEC_VSX_SIGN_EXTEND UNSPEC_VSX_XVCVSPSXDS + UNSPEC_VSX_XVCVSPHP UNSPEC_VSX_VSLO UNSPEC_VSX_EXTRACT UNSPEC_VSX_SXEXPDP UNSPEC_VSX_SXSIG UNSPEC_VSX_SIEXPDP @@ -2177,10 +2178,19 @@ UNSPEC_VSX_CVHPSP))] "TARGET_P9_VECTOR" "xvcvhpsp %x0,%x1" [(set_attr "type" "vecfloat")]) +;; Generate xvcvsphp +(define_insn "vsx_xvcvsphp" + [(set (match_operand:V4SI 0 "register_operand" "=wa") + (unspec:V4SI [(match_operand:V4SF 1 "vsx_register_operand" "wa")] + UNSPEC_VSX_XVCVSPHP))] + "TARGET_P9_VECTOR" + "xvcvsphp %x0,%x1" +[(set_attr "type" "vecfloat")]) + ;; xscvdpsp used for splat'ing a scalar to V4SF, knowing that the internal SF ;; format of scalars is actually DF. (define_insn "vsx_xscvdpsp_scalar" [(set (match_operand:V4SF 0 "vsx_register_operand" "=wa") (unspec:V4SF [(match_operand:SF 1 "vsx_register_operand" "wa")] diff --git a/gcc/testsuite/gcc.target/powerpc/builtins-1-p9-runnable.c b/gcc/testsuite/gcc.target/powerpc/builtins-1-p9-runnable.c index 0e4ab48..038ba44 100644 --- a/gcc/testsuite/gcc.target/powerpc/builtins-1-p9-runnable.c +++ b/gcc/testsuite/gcc.target/powerpc/builtins-1-p9-runnable.c @@ -1,25 +1,37 @@ /* { dg-do run { target { powerpc*-*-linux* && { lp64 && p9vector_hw } } } } */ /* { dg-require-effective-target powerpc_p9vector_ok } */ /* { dg-options "-O2 -mdejagnu-cpu=power9" } */ #include <altivec.h> +#include <stdio.h> void abort (void); int main() { int i; vector float vfa, vfb; - vector unsigned short vur, vuexpt; + vector unsigned short vur, vresult; - vfa = (vector float){3.4, 5.0, 20.0, 50.9 }; - vfb = (vector float){10.0, 40.0, 70.0, 100.0 }; - vuexpt = (vector unsigned short){ 3, 5, 20, 50, - 10, 40, 70, 100}; + vfa = (vector float){0.4, 1.6, 20.0, 99.9 }; + vfb = (vector float){10.0, -2.0, 70.0, 999.0 }; + /* Results from converting those single-precion floats to half-precision packed values. */ + vresult = (vector unsigned short) { 0x3666, 0x3e66, 0x4d00, 0x563e, + 0x4900, 0xc000, 0x5460, 0x63ce }; + + /* convert from single-precion float to unsigned short and pack */ vur = vec_pack_to_short_fp32 (vfa, vfb); +#ifdef DEBUG + for(i = 0; i< 4; i++) { printf("i=[%d] %f \n",i,vfa[i]); } + for(i = 0; i< 4; i++) { printf("i=[%d] %f \n",i+4,vfb[i]); } + for(i = 0; i< 8; i++) { printf("i=[%d] %d \n",i,vur[i]); } +#endif + for(i = 0; i< 8; i++) { - if (vur[i] != vuexpt[i]) + if (vur[i] != vresult[i]) { + printf("i=[%d] 0x%x != 0x%x \n",i,vur[i],vresult[i]); abort(); + } } }