Hi, ISA 3.0 introduces new instructions vrlwmi, vrldmi, vrlwnm, and vrldnm. This patch provides access to them via built-ins, including the vec_rlmi and vec_rlnm built-ins mandated by Appendix A of the ELFv2 ABI document. I also added a vec_vrlnm built-in, which is a more direct translation of the vrlwnm and vrldnm instructions that some users might prefer.
This has been bootstrapped and tested on powerpc64le-unknown-linux-gnu with no regressions. I am in the process of testing them on a big-endian system as well. Provided there are no problems there, is this ok for trunk? Thanks, Bill [gcc] 2017-01-16 Bill Schmidt <wschm...@linux.vnet.ibm.com> * config/rs6000/altivec.h (vec_rlmi): New #define. (vec_vrlnm): Likewise. (vec_rlnm): Likewise. * config/rs6000/altivec.md (UNSPEC_VRLMI): New UNSPEC enum value. (UNSPEC_VRLNM): Likewise. (VIlong): New mode iterator. (altivec_vrl<VI_char>mi): New define_insn. (altivec_vrl<VI_char>nm): Likewise. * config/rs6000/rs6000-builtin.def (VRLWNM): New monomorphic function entry. (VRLDNM): Likewise. (RLNM): New polymorphic function entry. (VRLWMI): New monomorphic function entry. (VRLDMI): Likewise. (RLMI): New polymorphic function entry. * config/rs6000/rs6000-c.c (altivec_overloaded_builtin_table): Add new entries for P9V_BUILTIN_VEC_RLMI and P9V_BUILTIN_VEC_RLNM. * doc/extend.texi: Add description of vec_rlmi, vec_rlnm, and vec_vrlnm. [gcc/testsuite] 2017-01-16 Bill Schmidt <wschm...@linux.vnet.ibm.com> * gcc.target/powerpc/vec-rlmi-rlnm.c: New file. 
Index: gcc/config/rs6000/altivec.h =================================================================== --- gcc/config/rs6000/altivec.h (revision 244498) +++ gcc/config/rs6000/altivec.h (working copy) @@ -168,6 +168,9 @@ #define vec_re __builtin_vec_re #define vec_round __builtin_vec_round #define vec_recipdiv __builtin_vec_recipdiv +#define vec_rlmi __builtin_vec_rlmi +#define vec_vrlnm __builtin_vec_rlnm +#define vec_rlnm(a,b,c) (__builtin_vec_rlnm((a),((c)<<8)|(b))) #define vec_rsqrt __builtin_vec_rsqrt #define vec_rsqrte __builtin_vec_rsqrte #define vec_vsubfp __builtin_vec_vsubfp Index: gcc/config/rs6000/altivec.md =================================================================== --- gcc/config/rs6000/altivec.md (revision 244498) +++ gcc/config/rs6000/altivec.md (working copy) @@ -156,6 +156,8 @@ UNSPEC_CMPRB UNSPEC_CMPRB2 UNSPEC_CMPEQB + UNSPEC_VRLMI + UNSPEC_VRLNM ]) (define_c_enum "unspecv" @@ -168,8 +170,10 @@ ;; Like VI, defined in vector.md, but add ISA 2.07 integer vector ops (define_mode_iterator VI2 [V4SI V8HI V16QI V2DI]) -;; Short vec in modes +;; Short vec int modes (define_mode_iterator VIshort [V8HI V16QI]) +;; Longer vec int modes for rotate/mask ops +(define_mode_iterator VIlong [V2DI V4SI]) ;; Vec float modes (define_mode_iterator VF [V4SF]) ;; Vec modes, pity mode iterators are not composable @@ -1627,6 +1631,25 @@ "vrl<VI_char> %0,%1,%2" [(set_attr "type" "vecsimple")]) +(define_insn "altivec_vrl<VI_char>mi" + [(set (match_operand:VIlong 0 "register_operand" "=v") + (unspec:VIlong [(match_operand:VIlong 1 "register_operand" "0") + (match_operand:VIlong 2 "register_operand" "v") + (match_operand:VIlong 3 "register_operand" "v")] + UNSPEC_VRLMI))] + "TARGET_P9_VECTOR" + "vrl<VI_char>mi %0,%2,%3" + [(set_attr "type" "veclogical")]) + +(define_insn "altivec_vrl<VI_char>nm" + [(set (match_operand:VIlong 0 "register_operand" "=v") + (unspec:VIlong [(match_operand:VIlong 1 "register_operand" "v") + (match_operand:VIlong 2 "register_operand" "v")] + 
UNSPEC_VRLNM))] + "TARGET_P9_VECTOR" + "vrl<VI_char>nm %0,%1,%2" + [(set_attr "type" "veclogical")]) + (define_insn "altivec_vsl" [(set (match_operand:V4SI 0 "register_operand" "=v") (unspec:V4SI [(match_operand:V4SI 1 "register_operand" "v") Index: gcc/config/rs6000/rs6000-builtin.def =================================================================== --- gcc/config/rs6000/rs6000-builtin.def (revision 244498) +++ gcc/config/rs6000/rs6000-builtin.def (working copy) @@ -1918,6 +1918,8 @@ BU_P9V_OVERLOAD_2 (VSRV, "vsrv") BU_P9V_AV_2 (VADUB, "vadub", CONST, vaduv16qi3) BU_P9V_AV_2 (VADUH, "vaduh", CONST, vaduv8hi3) BU_P9V_AV_2 (VADUW, "vaduw", CONST, vaduv4si3) +BU_P9V_AV_2 (VRLWNM, "vrlwnm", CONST, altivec_vrlwnm) +BU_P9V_AV_2 (VRLDNM, "vrldnm", CONST, altivec_vrldnm) /* ISA 3.0 vector overloaded 2 argument functions. */ BU_P9V_OVERLOAD_2 (VADU, "vadu") @@ -1924,7 +1926,15 @@ BU_P9V_OVERLOAD_2 (VADU, "vadu") BU_P9V_OVERLOAD_2 (VADUB, "vadub") BU_P9V_OVERLOAD_2 (VADUH, "vaduh") BU_P9V_OVERLOAD_2 (VADUW, "vaduw") +BU_P9V_OVERLOAD_2 (RLNM, "rlnm") +/* ISA 3.0 3-argument vector functions. */ +BU_P9V_AV_3 (VRLWMI, "vrlwmi", CONST, altivec_vrlwmi) +BU_P9V_AV_3 (VRLDMI, "vrldmi", CONST, altivec_vrldmi) + +/* ISA 3.0 vector overloaded 3-argument functions. */ +BU_P9V_OVERLOAD_3 (RLMI, "rlmi") + /* 1 argument vsx scalar functions added in ISA 3.0 (power9). 
*/ BU_P9V_64BIT_VSX_1 (VSEEDP, "scalar_extract_exp", CONST, xsxexpdp) BU_P9V_64BIT_VSX_1 (VSESDP, "scalar_extract_sig", CONST, xsxsigdp) Index: gcc/config/rs6000/rs6000-c.c =================================================================== --- gcc/config/rs6000/rs6000-c.c (revision 244498) +++ gcc/config/rs6000/rs6000-c.c (working copy) @@ -2202,6 +2202,18 @@ const struct altivec_builtin_types altivec_overloa RS6000_BTI_V16QI, RS6000_BTI_V16QI, RS6000_BTI_unsigned_V16QI, 0 }, { ALTIVEC_BUILTIN_VEC_VRLB, ALTIVEC_BUILTIN_VRLB, RS6000_BTI_unsigned_V16QI, RS6000_BTI_unsigned_V16QI, RS6000_BTI_unsigned_V16QI, 0 }, + { P9V_BUILTIN_VEC_RLMI, P9V_BUILTIN_VRLWMI, + RS6000_BTI_unsigned_V4SI, RS6000_BTI_unsigned_V4SI, + RS6000_BTI_unsigned_V4SI, RS6000_BTI_unsigned_V4SI }, + { P9V_BUILTIN_VEC_RLMI, P9V_BUILTIN_VRLDMI, + RS6000_BTI_unsigned_V2DI, RS6000_BTI_unsigned_V2DI, + RS6000_BTI_unsigned_V2DI, RS6000_BTI_unsigned_V2DI }, + { P9V_BUILTIN_VEC_RLNM, P9V_BUILTIN_VRLWNM, + RS6000_BTI_unsigned_V4SI, RS6000_BTI_unsigned_V4SI, + RS6000_BTI_unsigned_V4SI, 0 }, + { P9V_BUILTIN_VEC_RLNM, P9V_BUILTIN_VRLDNM, + RS6000_BTI_unsigned_V2DI, RS6000_BTI_unsigned_V2DI, + RS6000_BTI_unsigned_V2DI, 0 }, { ALTIVEC_BUILTIN_VEC_SL, ALTIVEC_BUILTIN_VSLB, RS6000_BTI_V16QI, RS6000_BTI_V16QI, RS6000_BTI_unsigned_V16QI, 0 }, { ALTIVEC_BUILTIN_VEC_SL, ALTIVEC_BUILTIN_VSLB, Index: gcc/doc/extend.texi =================================================================== --- gcc/doc/extend.texi (revision 244498) +++ gcc/doc/extend.texi (working copy) @@ -18179,6 +18179,43 @@ If any of the enabled test conditions is true, the in the result vector is -1. Otherwise (all of the enabled test conditions are false), the corresponding entry of the result vector is 0. 
+The following built-in functions are available for the PowerPC family +of processors, starting with ISA 3.0 or later (@option{-mcpu=power9}): +@smallexample +vector unsigned int vec_rlmi (vector unsigned int, vector unsigned int, + vector unsigned int); +vector unsigned long long vec_rlmi (vector unsigned long long, + vector unsigned long long, + vector unsigned long long); +vector unsigned int vec_rlnm (vector unsigned int, vector unsigned int, + vector unsigned int); +vector unsigned long long vec_rlnm (vector unsigned long long, + vector unsigned long long, + vector unsigned long long); +vector unsigned int vec_vrlnm (vector unsigned int, vector unsigned int); +vector unsigned long long vec_vrlnm (vector unsigned long long, + vector unsigned long long); +@end smallexample + +The result of @code{vec_rlmi} is obtained by rotating each element of +the first argument vector left and inserting it under mask into the +second argument vector. The third argument vector contains the mask +beginning in bits 11:15, the mask end in bits 19:23, and the shift +count in bits 27:31, of each element. + +The result of @code{vec_rlnm} is obtained by rotating each element of +the first argument vector left and ANDing it with a mask specified by +the second and third argument vectors. The second argument vector +contains the shift count for each element in the low-order byte. The +third argument vector contains the mask end for each element in the +low-order byte, with the mask begin in the next higher byte. + +The result of @code{vec_vrlnm} is obtained by rotating each element +of the first argument vector left and ANDing it with a mask. The +second argument vector contains the mask beginning in bits 11:15, +the mask end in bits 19:23, and the shift count in bits 27:31, +of each element. + If the cryptographic instructions are enabled (@option{-mcrypto} or @option{-mcpu=power8}), the following builtins are enabled. 
Index: gcc/testsuite/gcc.target/powerpc/vec-rlmi-rlnm.c =================================================================== --- gcc/testsuite/gcc.target/powerpc/vec-rlmi-rlnm.c (revision 0) +++ gcc/testsuite/gcc.target/powerpc/vec-rlmi-rlnm.c (working copy) @@ -0,0 +1,69 @@ +/* { dg-do compile { target { powerpc*-*-* } } } */ +/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { "-mcpu=power9" } } */ +/* { dg-require-effective-target lp64 } */ +/* { dg-require-effective-target powerpc_p9vector_ok } */ +/* { dg-options "-O2 -mcpu=power9" } */ + +#include <altivec.h> + +vector unsigned int +rlmi_test_1 (vector unsigned int x, vector unsigned int y, + vector unsigned int z) +{ + return vec_rlmi (x, y, z); +} + +vector unsigned long long +rlmi_test_2 (vector unsigned long long x, vector unsigned long long y, + vector unsigned long long z) +{ + return vec_rlmi (x, y, z); +} + +vector unsigned int +vrlnm_test_1 (vector unsigned int x, vector unsigned int y) +{ + return vec_vrlnm (x, y); +} + +vector unsigned long long +vrlnm_test_2 (vector unsigned long long x, vector unsigned long long y) +{ + return vec_vrlnm (x, y); +} + +vector unsigned int +rlnm_test_1 (vector unsigned int x, vector unsigned int y, + vector unsigned int z) +{ + return vec_rlnm (x, y, z); +} + +vector unsigned long long +rlnm_test_2 (vector unsigned long long x, vector unsigned long long y, + vector unsigned long long z) +{ + return vec_rlnm (x, y, z); +} + +/* Expected code generation for rlmi_test_1 is vrlwmi. + Expected code generation for rlmi_test_2 is vrldmi. + Expected code generation for vrlnm_test_1 is vrlwnm. + Expected code generation for vrlnm_test_2 is vrldnm. + Expected code generation for the others is more complex, because + the second and third arguments are combined by a shift and OR, + and because there is no splat-immediate doubleword. + - For rlnm_test_1: vspltisw, vslw, xxlor, vrlwnm. + - For rlnm_test_2: xxspltib, vextsb2d, vsld, xxlor, vrldnm. 
+ There is a choice of splat instructions in both cases, so we + just check for "splt". */ + +/* { dg-final { scan-assembler-times "vrlwmi" 1 } } */ +/* { dg-final { scan-assembler-times "vrldmi" 1 } } */ +/* { dg-final { scan-assembler-times "splt" 2 } } */ +/* { dg-final { scan-assembler-times "vextsb2d" 1 } } */ +/* { dg-final { scan-assembler-times "vslw" 1 } } */ +/* { dg-final { scan-assembler-times "vsld" 1 } } */ +/* { dg-final { scan-assembler-times "xxlor" 2 } } */ +/* { dg-final { scan-assembler-times "vrlwnm" 2 } } */ +/* { dg-final { scan-assembler-times "vrldnm" 2 } } */