Christophe Lyon via Gcc-patches <gcc-patches@gcc.gnu.org> writes: > This patch implements support for vectors of booleans to support MVE > predicates, instead of HImode. Since the ABI mandates pred16_t (aka > uint16_t) to represent predicates in intrinsics prototypes, we > introduce a new "predicate" type qualifier so that we can map relevant > builtins HImode arguments and return value to the appropriate vector > of booleans (VxBI). > > We have to update test_vector_ops_duplicate, because it iterates using > an offset in bytes, where we would need to iterate in bits: we stop > iterating when we reach the end of the vector of booleans. > > 2021-09-01 Christophe Lyon <christophe.l...@foss.st.com> > > gcc/ > PR target/100757 > PR target/101325 > * config/arm/arm-builtins.c (arm_type_qualifiers): Add > qualifier_predicate. > (arm_init_simd_builtin_types): Add new simd types. > (arm_init_builtin): Map predicate vectors arguments to HImode. > (arm_expand_builtin_args): Move HImode predicate arguments to VxBI > rtx. Move return value to HImode rtx. > * config/arm/arm-modes.def (V16BI, V8BI, V4BI): New modes. > * config/arm/arm-simd-builtin-types.def (Pred1x16_t, > Pred2x8_t,Pred4x4_t): New. > * simplify-rtx.c (test_vector_ops_duplicate): Avoid going past the > end of the test vector. > > diff --git a/gcc/config/arm/arm-builtins.c b/gcc/config/arm/arm-builtins.c > index 3a9ff8f26b8..771759f0cdd 100644 > --- a/gcc/config/arm/arm-builtins.c > +++ b/gcc/config/arm/arm-builtins.c > @@ -92,7 +92,9 @@ enum arm_type_qualifiers > qualifier_lane_pair_index = 0x1000, > /* Lane indices selected in quadtuplets - must be within range of previous > argument = a vector. */ > - qualifier_lane_quadtup_index = 0x2000 > + qualifier_lane_quadtup_index = 0x2000, > + /* MVE vector predicates. 
*/ > + qualifier_predicate = 0x4000 > }; > > /* The qualifier_internal allows generation of a unary builtin from > @@ -1633,6 +1635,13 @@ arm_init_simd_builtin_types (void) > arm_simd_types[Bfloat16x4_t].eltype = arm_bf16_type_node; > arm_simd_types[Bfloat16x8_t].eltype = arm_bf16_type_node; > > + if (TARGET_HAVE_MVE) > + { > + arm_simd_types[Pred1x16_t].eltype = unsigned_intHI_type_node; > + arm_simd_types[Pred2x8_t].eltype = unsigned_intHI_type_node; > + arm_simd_types[Pred4x4_t].eltype = unsigned_intHI_type_node; > + } > + > for (i = 0; i < nelts; i++) > { > tree eltype = arm_simd_types[i].eltype; > @@ -1780,6 +1789,11 @@ arm_init_builtin (unsigned int fcode, > arm_builtin_datum *d, > if (qualifiers & qualifier_map_mode) > op_mode = d->mode; > > + /* MVE Predicates use HImode as mandated by the ABI: pred16_t is > unsigned > + short. */ > + if (qualifiers & qualifier_predicate) > + op_mode = HImode; > + > /* For pointers, we want a pointer to the basic type > of the vector. */ > if (qualifiers & qualifier_pointer && VECTOR_MODE_P (op_mode)) > @@ -3024,6 +3038,11 @@ arm_expand_builtin_args (rtx target, machine_mode > map_mode, int fcode, > case ARG_BUILTIN_COPY_TO_REG: > if (POINTER_TYPE_P (TREE_TYPE (arg[argc]))) > op[argc] = convert_memory_address (Pmode, op[argc]); > + > + /* MVE uses mve_pred16_t (aka HImode) for vectors of predicates. 
> */ > + if (GET_MODE_CLASS (mode[argc]) == MODE_VECTOR_BOOL) > + op[argc] = gen_lowpart (mode[argc], op[argc]); > + > /*gcc_assert (GET_MODE (op[argc]) == mode[argc]); */ > if (!(*insn_data[icode].operand[opno].predicate) > (op[argc], mode[argc])) > @@ -3229,6 +3248,13 @@ constant_arg: > else > emit_insn (insn); > > + if (GET_MODE_CLASS (tmode) == MODE_VECTOR_BOOL) > + { > + rtx HItarget = gen_reg_rtx (HImode); > + emit_move_insn (HItarget, gen_lowpart (HImode, target)); > + return HItarget; > + } > + > return target; > } > > diff --git a/gcc/config/arm/arm-modes.def b/gcc/config/arm/arm-modes.def > index a5e74ba3943..b414a709a62 100644 > --- a/gcc/config/arm/arm-modes.def > +++ b/gcc/config/arm/arm-modes.def > @@ -84,6 +84,11 @@ VECTOR_MODE (FLOAT, BF, 2); /* V2BF. */ > VECTOR_MODE (FLOAT, BF, 4); /* V4BF. */ > VECTOR_MODE (FLOAT, BF, 8); /* V8BF. */ > > +/* Predicates for MVE. */ > +VECTOR_BOOL_MODE (V16BI, 16, 2); > +VECTOR_BOOL_MODE (V8BI, 8, 2); > +VECTOR_BOOL_MODE (V4BI, 4, 2); > + > /* Fraction and accumulator vector modes. 
*/ > VECTOR_MODES (FRACT, 4); /* V4QQ V2HQ */ > VECTOR_MODES (UFRACT, 4); /* V4UQQ V2UHQ */ > diff --git a/gcc/config/arm/arm-simd-builtin-types.def > b/gcc/config/arm/arm-simd-builtin-types.def > index c19a1b6e3eb..d3987985b4c 100644 > --- a/gcc/config/arm/arm-simd-builtin-types.def > +++ b/gcc/config/arm/arm-simd-builtin-types.def > @@ -51,3 +51,7 @@ > ENTRY (Bfloat16x2_t, V2BF, none, 32, bfloat16, 20) > ENTRY (Bfloat16x4_t, V4BF, none, 64, bfloat16, 20) > ENTRY (Bfloat16x8_t, V8BF, none, 128, bfloat16, 20) > + > + ENTRY (Pred1x16_t, V16BI, unsigned, 16, uint16, 21) > + ENTRY (Pred2x8_t, V8BI, unsigned, 8, uint16, 21) > + ENTRY (Pred4x4_t, V4BI, unsigned, 4, uint16, 21) > diff --git a/gcc/simplify-rtx.c b/gcc/simplify-rtx.c > index a719f57870f..1453f984f99 100644 > --- a/gcc/simplify-rtx.c > +++ b/gcc/simplify-rtx.c > @@ -7642,6 +7642,13 @@ test_vector_ops_duplicate (machine_mode mode, rtx > scalar_reg) > rtx mask = GEN_INT ((HOST_WIDE_INT_1U << i) | (i + 1)); > rtx vm = gen_rtx_VEC_MERGE (mode, duplicate, vector_reg, mask); > poly_uint64 offset = i * GET_MODE_SIZE (inner_mode); > + > + /* OFFSET is in bytes, so stop testing when we go past the end of a > + vector of booleans, where we would need an offset in bits. */ > + if ((GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL) > + && (maybe_ge (offset, GET_MODE_SIZE (mode)))) > + break; > +
I think we should skip the whole `for` loop for vectors of booleans (MODE_VECTOR_BOOL). Although the offset is in bytes, the vec_merge indices are still in elements (usually bits), and so the loop will test something invalid for i != 0. OK with that change, thanks. Richard > ASSERT_RTX_EQ (scalar_reg, > simplify_gen_subreg (inner_mode, vm, > mode, offset));