On 17 February 2018 at 18:22, Richard Henderson
<richard.hender...@linaro.org> wrote:
> Signed-off-by: Richard Henderson <richard.hender...@linaro.org>
> ---
> diff --git a/target/arm/sve_helper.c b/target/arm/sve_helper.c
> index 86cd792cdf..ae433861f8 100644
> --- a/target/arm/sve_helper.c
> +++ b/target/arm/sve_helper.c
> @@ -46,14 +46,14 @@
>   *
>   * The return value has bit 31 set if N is set, bit 1 set if Z is clear,
>   * and bit 0 set if C is set.
> - *
> - * This is an iterative function, called for each Pd and Pg word
> - * moving forward.
>   */
>
>  /* For no G bits set, NZCV = C.  */
>  #define PREDTEST_INIT 1
>
> +/* This is an iterative function, called for each Pd and Pg word
> + * moving forward.
> + */

Why move this comment?

>  static uint32_t iter_predtest_fwd(uint64_t d, uint64_t g, uint32_t flags)
>  {
>      if (likely(g)) {
> @@ -73,6 +73,28 @@ static uint32_t iter_predtest_fwd(uint64_t d, uint64_t g, uint32_t flags)
>      return flags;
>  }
>
> +/* This is an iterative function, called for each Pd and Pg word
> + * moving backward.
> + */
> +static uint32_t iter_predtest_bwd(uint64_t d, uint64_t g, uint32_t flags)
> +{
> +    if (likely(g)) {
> +        /* Compute C from first (i.e last) !(D & G).
> +           Use bit 2 to signal first G bit seen.  */
> +        if (!(flags & 4)) {
> +            flags += 4 - 1; /* add bit 2, subtract C from PREDTEST_INIT */
> +            flags |= (d & pow2floor(g)) == 0;
> +        }
> +
> +        /* Accumulate Z from each D & G.  */
> +        flags |= ((d & g) != 0) << 1;
> +
> +        /* Compute N from last (i.e first) D & G.  Replace previous.  */
> +        flags = deposit32(flags, 31, 1, (d & (g & -g)) != 0);
> +    }
> +    return flags;
> +}
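
(Not a problem with the patch, just checking my reading of the
bit-twiddling above: taking a single word with d == 5 and g == 3,
i.e. active elements 0 and 1 with element 0 true and element 1
false, I get

    flags = 1 (PREDTEST_INIT)
    first nonzero g word: flags += 4 - 1 -> 4;
        d & pow2floor(g) == d & 2 == 0, so C set -> flags = 5
    d & g != 0, so Z clear: bit 1 set -> flags = 7
    d & (g & -g) == d & 1 != 0, so N set -> flags = 0x80000007

which is what I'd expect from PTEST here: N from the first active
element, C from !(last active element), Z clear because some active
element is true. So the logic looks right to me.)
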
> +
>  /* The same for a single word predicate.  */
>  uint32_t HELPER(sve_predtest1)(uint64_t d, uint64_t g)
>  {
> @@ -2180,3 +2202,168 @@ void HELPER(sve_sel_zpzz_d)(void *vd, void *vn, void *vm,
>          d[i] = (pg[H1(i)] & 1 ? nn : mm);
>      }
>  }
> +
> +/* Two operand comparison controlled by a predicate.
> + * ??? It is very tempting to want to be able to expand this inline
> + * with x86 instructions, e.g.
> + *
> + *    vcmpeqw    zm, zn, %ymm0
> + *    vpmovmskb  %ymm0, %eax
> + *    and        $0x5555, %eax
> + *    and        pg, %eax
> + *
> + * or even aarch64, e.g.
> + *
> + *    // mask = 4000 1000 0400 0100 0040 0010 0004 0001
> + *    cmeq       v0.8h, zn, zm
> + *    and        v0.8h, v0.8h, mask
> + *    addv       h0, v0.8h
> + *    and        v0.8b, pg
> + *
> + * However, coming up with an abstraction that allows vector inputs and
> + * a scalar output, and also handles the byte-ordering of sub-uint64_t
> + * scalar outputs, is tricky.
> + */
> +#define DO_CMP_PPZZ(NAME, TYPE, OP, H, MASK)                                  \
> +uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
> +{                                                                             \
> +    intptr_t opr_sz = simd_oprsz(desc);                                       \
> +    uint32_t flags = PREDTEST_INIT;                                           \
> +    intptr_t i = opr_sz;                                                      \
> +    do {                                                                      \
> +        uint64_t out = 0, pg;                                                 \
> +        do {                                                                  \
> +            i -= sizeof(TYPE), out <<= sizeof(TYPE);                          \
> +            TYPE nn = *(TYPE *)(vn + H(i));                                   \
> +            TYPE mm = *(TYPE *)(vm + H(i));                                   \
> +            out |= nn OP mm;                                                  \
> +        } while (i & 63);                                                     \
> +        pg = *(uint64_t *)(vg + (i >> 3)) & MASK;                             \
> +        out &= pg;                                                            \
> +        *(uint64_t *)(vd + (i >> 3)) = out;                                   \
> +        flags = iter_predtest_bwd(out, pg, flags);                            \
> +    } while (i > 0);                                                          \
> +    return flags;                                                             \
> +}

Why do we iterate backwards through the vector? As far as I can see
the pseudocode iterates forwards, and I don't think it makes a
difference to the result which way we go. (I've put a rough sketch
of the forward version I had in mind in a PS below.)

Otherwise
Reviewed-by: Peter Maydell <peter.mayd...@linaro.org>

thanks
-- PMM
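
PS: for concreteness, this is roughly the forward-iterating loop I
was imagining, modulo the macro-isation -- entirely untested, and
the "base" bookkeeping is my own invention, but I believe it feeds
iter_predtest_fwd the same per-word (out, pg) values that the patch
feeds iter_predtest_bwd, just in the opposite order:

    intptr_t i = 0;
    do {
        uint64_t out = 0, pg;
        intptr_t base = i;            /* start of this 64-byte chunk */
        do {
            TYPE nn = *(TYPE *)(vn + H(i));
            TYPE mm = *(TYPE *)(vm + H(i));
            /* one predicate bit per byte of vector */
            out |= (uint64_t)(nn OP mm) << (i & 63);
            i += sizeof(TYPE);
        } while (i & 63 && i < opr_sz);
        pg = *(uint64_t *)(vg + (base >> 3)) & MASK;
        out &= pg;
        *(uint64_t *)(vd + (base >> 3)) = out;
        flags = iter_predtest_fwd(out, pg, flags);
    } while (i < opr_sz);

The extra "base" variable and the second inner-loop condition do
make this clumsier than counting down to zero, so perhaps that's
the answer to my own question.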