Christophe Lyon via Gcc-patches <gcc-patches@gcc.gnu.org> writes:
> This patch implements support for vectors of booleans to support MVE
> predicates, instead of HImode.  Since the ABI mandates pred16_t (aka
> uint16_t) to represent predicates in intrinsics prototypes, we
> introduce a new "predicate" type qualifier so that we can map the
> relevant builtins' HImode arguments and return values to the
> appropriate vector of booleans (VxBI).
>
> We have to update test_vector_ops_duplicate, because it iterates using
> an offset in bytes, where we would need to iterate in bits: we stop
> iterating when we reach the end of the vector of booleans.
>
> 2021-09-01  Christophe Lyon  <christophe.l...@foss.st.com>
>
>       gcc/
>       PR target/100757
>       PR target/101325
>       * config/arm/arm-builtins.c (arm_type_qualifiers): Add
>       qualifier_predicate.
>       (arm_init_simd_builtin_types): Add new simd types.
>       (arm_init_builtin): Map predicate vectors arguments to HImode.
>       (arm_expand_builtin_args): Move HImode predicate arguments to VxBI
>       rtx. Move return value to HImode rtx.
>       * config/arm/arm-modes.def (V16BI, V8BI, V4BI): New modes.
>       * config/arm/arm-simd-builtin-types.def (Pred1x16_t,
>       Pred2x8_t, Pred4x4_t): New.
>       * simplify-rtx.c (test_vector_ops_duplicate): Avoid going past the
>       end of the test vector.
>
> diff --git a/gcc/config/arm/arm-builtins.c b/gcc/config/arm/arm-builtins.c
> index 3a9ff8f26b8..771759f0cdd 100644
> --- a/gcc/config/arm/arm-builtins.c
> +++ b/gcc/config/arm/arm-builtins.c
> @@ -92,7 +92,9 @@ enum arm_type_qualifiers
>    qualifier_lane_pair_index = 0x1000,
>    /* Lane indices selected in quadtuplets - must be within range of previous
>       argument = a vector.  */
> -  qualifier_lane_quadtup_index = 0x2000
> +  qualifier_lane_quadtup_index = 0x2000,
> +  /* MVE vector predicates.  */
> +  qualifier_predicate = 0x4000
>  };
>  
>  /*  The qualifier_internal allows generation of a unary builtin from
> @@ -1633,6 +1635,13 @@ arm_init_simd_builtin_types (void)
>    arm_simd_types[Bfloat16x4_t].eltype = arm_bf16_type_node;
>    arm_simd_types[Bfloat16x8_t].eltype = arm_bf16_type_node;
>  
> +  if (TARGET_HAVE_MVE)
> +    {
> +      arm_simd_types[Pred1x16_t].eltype = unsigned_intHI_type_node;
> +      arm_simd_types[Pred2x8_t].eltype = unsigned_intHI_type_node;
> +      arm_simd_types[Pred4x4_t].eltype = unsigned_intHI_type_node;
> +    }
> +
>    for (i = 0; i < nelts; i++)
>      {
>        tree eltype = arm_simd_types[i].eltype;
> @@ -1780,6 +1789,11 @@ arm_init_builtin (unsigned int fcode, arm_builtin_datum *d,
>        if (qualifiers & qualifier_map_mode)
>       op_mode = d->mode;
>  
> +      /* MVE Predicates use HImode as mandated by the ABI: pred16_t is unsigned
> +      short.  */
> +      if (qualifiers & qualifier_predicate)
> +     op_mode = HImode;
> +
>        /* For pointers, we want a pointer to the basic type
>        of the vector.  */
>        if (qualifiers & qualifier_pointer && VECTOR_MODE_P (op_mode))
> @@ -3024,6 +3038,11 @@ arm_expand_builtin_args (rtx target, machine_mode map_mode, int fcode,
>           case ARG_BUILTIN_COPY_TO_REG:
>             if (POINTER_TYPE_P (TREE_TYPE (arg[argc])))
>               op[argc] = convert_memory_address (Pmode, op[argc]);
> +
> +           /* MVE uses mve_pred16_t (aka HImode) for vectors of predicates.  */
> +           if (GET_MODE_CLASS (mode[argc]) == MODE_VECTOR_BOOL)
> +             op[argc] = gen_lowpart (mode[argc], op[argc]);
> +
>             /*gcc_assert (GET_MODE (op[argc]) == mode[argc]); */
>             if (!(*insn_data[icode].operand[opno].predicate)
>                 (op[argc], mode[argc]))
> @@ -3229,6 +3248,13 @@ constant_arg:
>    else
>      emit_insn (insn);
>  
> +  if (GET_MODE_CLASS (tmode) == MODE_VECTOR_BOOL)
> +    {
> +      rtx HItarget = gen_reg_rtx (HImode);
> +      emit_move_insn (HItarget, gen_lowpart (HImode, target));
> +      return HItarget;
> +    }
> +
>    return target;
>  }
>  
> diff --git a/gcc/config/arm/arm-modes.def b/gcc/config/arm/arm-modes.def
> index a5e74ba3943..b414a709a62 100644
> --- a/gcc/config/arm/arm-modes.def
> +++ b/gcc/config/arm/arm-modes.def
> @@ -84,6 +84,11 @@ VECTOR_MODE (FLOAT, BF, 2);   /*                 V2BF.  */
>  VECTOR_MODE (FLOAT, BF, 4);   /*              V4BF.  */
>  VECTOR_MODE (FLOAT, BF, 8);   /*              V8BF.  */
>  
> +/* Predicates for MVE.  */
> +VECTOR_BOOL_MODE (V16BI, 16, 2);
> +VECTOR_BOOL_MODE (V8BI, 8, 2);
> +VECTOR_BOOL_MODE (V4BI, 4, 2);
> +
>  /* Fraction and accumulator vector modes.  */
>  VECTOR_MODES (FRACT, 4);      /* V4QQ  V2HQ */
>  VECTOR_MODES (UFRACT, 4);     /* V4UQQ V2UHQ */
> diff --git a/gcc/config/arm/arm-simd-builtin-types.def b/gcc/config/arm/arm-simd-builtin-types.def
> index c19a1b6e3eb..d3987985b4c 100644
> --- a/gcc/config/arm/arm-simd-builtin-types.def
> +++ b/gcc/config/arm/arm-simd-builtin-types.def
> @@ -51,3 +51,7 @@
>    ENTRY (Bfloat16x2_t, V2BF, none, 32, bfloat16, 20)
>    ENTRY (Bfloat16x4_t, V4BF, none, 64, bfloat16, 20)
>    ENTRY (Bfloat16x8_t, V8BF, none, 128, bfloat16, 20)
> +
> +  ENTRY (Pred1x16_t, V16BI, unsigned, 16, uint16, 21)
> +  ENTRY (Pred2x8_t, V8BI, unsigned, 8, uint16, 21)
> +  ENTRY (Pred4x4_t, V4BI, unsigned, 4, uint16, 21)
> diff --git a/gcc/simplify-rtx.c b/gcc/simplify-rtx.c
> index a719f57870f..1453f984f99 100644
> --- a/gcc/simplify-rtx.c
> +++ b/gcc/simplify-rtx.c
> @@ -7642,6 +7642,13 @@ test_vector_ops_duplicate (machine_mode mode, rtx scalar_reg)
>         rtx mask = GEN_INT ((HOST_WIDE_INT_1U << i) | (i + 1));
>         rtx vm = gen_rtx_VEC_MERGE (mode, duplicate, vector_reg, mask);
>         poly_uint64 offset = i * GET_MODE_SIZE (inner_mode);
> +
> +       /* OFFSET is in bytes, so stop testing when we go past the end of a
> +          vector of booleans, where we would need an offset in bits.  */
> +       if ((GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL)
> +           && (maybe_ge (offset, GET_MODE_SIZE (mode))))
> +         break;
> +

I think we should skip the whole for loop for vector booleans.  Although the
offset is in bytes, the vec_merge indices are still in elements (usually
bits) and so the loop will test something invalid for i != 0.

OK with that change, thanks.

Richard

>         ASSERT_RTX_EQ (scalar_reg,
>                        simplify_gen_subreg (inner_mode, vm,
>                                             mode, offset));

Reply via email to