On 17 February 2018 at 18:22, Richard Henderson
<richard.hender...@linaro.org> wrote:
> Signed-off-by: Richard Henderson <richard.hender...@linaro.org>
> ---
> diff --git a/target/arm/sve_helper.c b/target/arm/sve_helper.c
> index 86cd792cdf..ae433861f8 100644
> --- a/target/arm/sve_helper.c
> +++ b/target/arm/sve_helper.c
> @@ -46,14 +46,14 @@
>   *
>   * The return value has bit 31 set if N is set, bit 1 set if Z is clear,
>   * and bit 0 set if C is set.
> - *
> - * This is an iterative function, called for each Pd and Pg word
> - * moving forward.
>   */
>
>  /* For no G bits set, NZCV = C.  */
>  #define PREDTEST_INIT 1
>
> +/* This is an iterative function, called for each Pd and Pg word
> + * moving forward.
> + */

Why move this comment?

>  static uint32_t iter_predtest_fwd(uint64_t d, uint64_t g, uint32_t flags)
>  {
>      if (likely(g)) {
> @@ -73,6 +73,28 @@ static uint32_t iter_predtest_fwd(uint64_t d, uint64_t g, uint32_t flags)
>      return flags;
>  }
>
> +/* This is an iterative function, called for each Pd and Pg word
> + * moving backward.
> + */
> +static uint32_t iter_predtest_bwd(uint64_t d, uint64_t g, uint32_t flags)
> +{
> +    if (likely(g)) {
> +        /* Compute C from first (i.e last) !(D & G).
> +           Use bit 2 to signal first G bit seen.  */
> +        if (!(flags & 4)) {
> +            flags += 4 - 1; /* add bit 2, subtract C from PREDTEST_INIT */
> +            flags |= (d & pow2floor(g)) == 0;
> +        }
> +
> +        /* Accumulate Z from each D & G.  */
> +        flags |= ((d & g) != 0) << 1;
> +
> +        /* Compute N from last (i.e first) D & G.  Replace previous.  */
> +        flags = deposit32(flags, 31, 1, (d & (g & -g)) != 0);
> +    }
> +    return flags;
> +}
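
(Not a problem with the patch, just checking my reading of the
bit-twiddling above: taking a single word with d == 5 and g == 3,
i.e. active elements 0 and 1 with element 0 true and element 1
false, I get

    flags = 1 (PREDTEST_INIT)
    first nonzero g word: flags += 4 - 1 -> 4;
        d & pow2floor(g) == d & 2 == 0, so C set -> flags = 5
    d & g != 0, so Z clear: bit 1 set -> flags = 7
    d & (g & -g) == d & 1 != 0, so N set -> flags = 0x80000007

which is what I'd expect from PTEST here: N from the first active
element, C from !(last active element), Z clear because some active
element is true. So the logic looks right to me.)
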
> +
>  /* The same for a single word predicate.  */
>  uint32_t HELPER(sve_predtest1)(uint64_t d, uint64_t g)
>  {
> @@ -2180,3 +2202,168 @@ void HELPER(sve_sel_zpzz_d)(void *vd, void *vn, void *vm,
>          d[i] = (pg[H1(i)] & 1 ? nn : mm);
>      }
>  }
> +
> +/* Two operand comparison controlled by a predicate.
> + * ??? It is very tempting to want to be able to expand this inline
> + * with x86 instructions, e.g.
> + *
> + *    vcmpeqw    zm, zn, %ymm0
> + *    vpmovmskb  %ymm0, %eax
> + *    and        $0x5555, %eax
> + *    and        pg, %eax
> + *
> + * or even aarch64, e.g.
> + *
> + *    // mask = 4000 1000 0400 0100 0040 0010 0004 0001
> + *    cmeq       v0.8h, zn, zm
> + *    and        v0.8h, v0.8h, mask
> + *    addv       h0, v0.8h
> + *    and        v0.8b, pg
> + *
> + * However, coming up with an abstraction that allows vector inputs and
> + * a scalar output, and also handles the byte-ordering of sub-uint64_t
> + * scalar outputs, is tricky.
> + */
> +#define DO_CMP_PPZZ(NAME, TYPE, OP, H, MASK)                                  \
> +uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
> +{                                                                             \
> +    intptr_t opr_sz = simd_oprsz(desc);                                       \
> +    uint32_t flags = PREDTEST_INIT;                                           \
> +    intptr_t i = opr_sz;                                                      \
> +    do {                                                                      \
> +        uint64_t out = 0, pg;                                                 \
> +        do {                                                                  \
> +            i -= sizeof(TYPE), out <<= sizeof(TYPE);                          \
> +            TYPE nn = *(TYPE *)(vn + H(i));                                   \
> +            TYPE mm = *(TYPE *)(vm + H(i));                                   \
> +            out |= nn OP mm;                                                  \
> +        } while (i & 63);                                                     \
> +        pg = *(uint64_t *)(vg + (i >> 3)) & MASK;                             \
> +        out &= pg;                                                            \
> +        *(uint64_t *)(vd + (i >> 3)) = out;                                   \
> +        flags = iter_predtest_bwd(out, pg, flags);                            \
> +    } while (i > 0);                                                          \
> +    return flags;                                                             \
> +}

Why do we iterate backwards through the vector? As far as I can see
the pseudocode iterates forwards, and I don't think it makes a
difference to the result which way we go. (I've put a rough sketch
of the forward version I had in mind in a PS below.)

Otherwise
Reviewed-by: Peter Maydell <peter.mayd...@linaro.org>

thanks
-- PMM
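
PS: for concreteness, this is roughly the forward-iterating loop I
was imagining, modulo the macro-isation -- entirely untested, and
the "base" bookkeeping is my own invention, but I believe it feeds
iter_predtest_fwd the same per-word (out, pg) values that the patch
feeds iter_predtest_bwd, just in the opposite order:

    intptr_t i = 0;
    do {
        uint64_t out = 0, pg;
        intptr_t base = i;            /* start of this 64-byte chunk */
        do {
            TYPE nn = *(TYPE *)(vn + H(i));
            TYPE mm = *(TYPE *)(vm + H(i));
            /* one predicate bit per byte of vector */
            out |= (uint64_t)(nn OP mm) << (i & 63);
            i += sizeof(TYPE);
        } while (i & 63 && i < opr_sz);
        pg = *(uint64_t *)(vg + (base >> 3)) & MASK;
        out &= pg;
        *(uint64_t *)(vd + (base >> 3)) = out;
        flags = iter_predtest_fwd(out, pg, flags);
    } while (i < opr_sz);

The extra "base" variable and the second inner-loop condition do
make this clumsier than counting down to zero, so perhaps that's
the answer to my own question.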