Use the carry-out vector as the basis for computing AF, CF and OF. The cost
is essentially unchanged, because computing the carry-out vector takes just
four boolean operations, and the code is much smaller because add/adc/sub/sbb
now share most of it.
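To make the mapping concrete, here is an 8-bit worked example (illustration
only, not part of the patch; MAJ_INV1 and ADD_COUT_VEC are the macros added
to cpu.h below, the local variables are made up):

    #define MAJ_INV1(a, b, c) ((((a) ^ (b)) & ((b) ^ (c))) ^ (c))
    #define ADD_COUT_VEC(op1, op2, result) MAJ_INV1(result, op1, op2)

    uint8_t x = 0x7f, y = 0x01;
    uint8_t sum = x + y;                        /* 0x80 */
    uint8_t cout = ADD_COUT_VEC(x, y, sum);     /* 0x7f: bits 0-6 carried */
    int cf = cout >> 7;                         /* carry out of bit 7 -> 0 */
    int af = (cout >> 3) & 1;                   /* carry out of bit 3 -> 1 */
    int of = ((cout >> 6) ^ (cout >> 7)) & 1;   /* cout(6) ^ cout(7) -> 1 */

0x7f + 0x01 overflows to 0x80, so OF is set while CF is not, and both fall
out of two adjacent bits of the same vector.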
An algorithm similar to the one used in target/i386/emulate can also be used
for APX, in order to build the result of CCMP/CTEST with a new CC_OP_*.
CCMP needs to place into the flags either the result of a subtraction or a
constant value; CTEST likewise places into the flags either the result of an
AND or a constant value.

The new CC_OP for CCMP and CTEST would store, for a successful predicate:
- in DST and SRC2, the result of the operation;
- in SRC, a carry-out vector for CCMP or zero for CTEST.

If the default flag values are used, DST/SRC/SRC2 can be filled with
constants:
- in DST, the negated ZF;
- in SRC's top 2 bits, a value that results in the desired OF and CF;
- in SRC2, a suitable value (any of 0/1/~0/~1) that can be used instead of
  DST to compute the desired SF and PF.

Signed-off-by: Paolo Bonzini <pbonz...@redhat.com>
---
 target/i386/cpu.h                        | 25 ++++
 target/i386/tcg/cc_helper_template.h.inc | 80 ++++++++----------------
 target/i386/hvf/x86_flags.c              | 25 --------
 3 files changed, 52 insertions(+), 78 deletions(-)

diff --git a/target/i386/cpu.h b/target/i386/cpu.h
index 76f24446a55..7a8d695bdb1 100644
--- a/target/i386/cpu.h
+++ b/target/i386/cpu.h
@@ -2843,4 +2843,29 @@ static inline bool ctl_has_irq(CPUX86State *env)
 # define TARGET_VSYSCALL_PAGE  (UINT64_C(-10) << 20)
 #endif
 
+/* majority(NOT a, b, c) = (a ^ b) ? b : c */
+#define MAJ_INV1(a, b, c) ((((a) ^ (b)) & ((b) ^ (c))) ^ (c))
+
+/*
+ * ADD_COUT_VEC(x, y) = majority((x + y) ^ x ^ y, x, y)
+ *
+ * If two corresponding bits in x and y are the same, that's the carry
+ * independent of the value (x+y)^x^y.  Hence x^y can be replaced with
+ * 1 in (x+y)^x^y, resulting in majority(NOT (x+y), x, y)
+ */
+#define ADD_COUT_VEC(op1, op2, result) \
+    MAJ_INV1(result, op1, op2)
+
+/*
+ * SUB_COUT_VEC(x, y) = NOT majority(x, NOT y, (x - y) ^ x ^ NOT y)
+ *                    = majority(NOT x, y, (x - y) ^ x ^ y)
+ *
+ * Note that the carry out is actually a borrow, i.e. it is inverted.
+ * If two corresponding bits in x and y are different, the value of the
+ * bit in (x-y)^x^y likewise does not matter.  Hence, x^y can be replaced
+ * with 0 in (x-y)^x^y, resulting in majority(NOT x, y, x-y)
+ */
+#define SUB_COUT_VEC(op1, op2, result) \
+    MAJ_INV1(op1, op2, result)
+
 #endif /* I386_CPU_H */
diff --git a/target/i386/tcg/cc_helper_template.h.inc b/target/i386/tcg/cc_helper_template.h.inc
index b821e5bca3b..d8fd976ca15 100644
--- a/target/i386/tcg/cc_helper_template.h.inc
+++ b/target/i386/tcg/cc_helper_template.h.inc
@@ -44,18 +44,32 @@
 
 /* dynamic flags computation */
 
-static uint32_t glue(compute_all_add, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1)
+static uint32_t glue(compute_all_cout, SUFFIX)(DATA_TYPE dst, DATA_TYPE carries)
 {
-    uint32_t cf, pf, af, zf, sf, of;
-    DATA_TYPE src2 = dst - src1;
+    uint32_t af_cf, pf, zf, sf, of;
 
-    cf = dst < src1;
+    /* PF, ZF, SF computed from result. */
     pf = compute_pf(dst);
-    af = (dst ^ src1 ^ src2) & CC_A;
     zf = (dst == 0) * CC_Z;
     sf = lshift(dst, 8 - DATA_BITS) & CC_S;
-    of = lshift((src1 ^ src2 ^ -1) & (src1 ^ dst), 12 - DATA_BITS) & CC_O;
-    return cf + pf + af + zf + sf + of;
+
+    /*
+     * AF, CF, OF computed from carry out vector.  To compute AF and CF, rotate it
+     * left by one so cout(DATA_BITS - 1) is in bit 0 and cout(3) in bit 4.
+     *
+     * To compute OF, place the highest two carry bits into OF and the bit
+     * immediately to the right of it; then, adding CC_O / 2 XORs them.
+     */
+    af_cf = ((carries << 1) | (carries >> (DATA_BITS - 1))) & (CC_A | CC_C);
+    of = (lshift(carries, 12 - DATA_BITS) + CC_O / 2) & CC_O;
+    return pf + zf + sf + af_cf + of;
+}
+
+static uint32_t glue(compute_all_add, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1)
+{
+    DATA_TYPE src2 = dst - src1;
+    DATA_TYPE carries = ADD_COUT_VEC(src1, src2, dst);
+    return glue(compute_all_cout, SUFFIX)(dst, carries);
 }
 
 static int glue(compute_c_add, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1)
@@ -66,25 +80,9 @@ static int glue(compute_c_add, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1)
 static uint32_t glue(compute_all_adc, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1,
                                               DATA_TYPE src3)
 {
-    uint32_t cf, pf, af, zf, sf, of;
-
-#ifdef WIDER_TYPE
-    WIDER_TYPE src13 = (WIDER_TYPE) src1 + (WIDER_TYPE) src3;
-    DATA_TYPE src2 = dst - src13;
-
-    cf = dst < src13;
-#else
     DATA_TYPE src2 = dst - src1 - src3;
-
-    cf = (src3 ? dst <= src1 : dst < src1);
-#endif
-
-    pf = compute_pf(dst);
-    af = (dst ^ src1 ^ src2) & 0x10;
-    zf = (dst == 0) << 6;
-    sf = lshift(dst, 8 - DATA_BITS) & 0x80;
-    of = lshift((src1 ^ src2 ^ -1) & (src1 ^ dst), 12 - DATA_BITS) & CC_O;
-    return cf + pf + af + zf + sf + of;
+    DATA_TYPE carries = ADD_COUT_VEC(src1, src2, dst);
+    return glue(compute_all_cout, SUFFIX)(dst, carries);
 }
 
 static int glue(compute_c_adc, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1,
@@ -101,16 +99,9 @@ static int glue(compute_c_adc, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1,
 
 static uint32_t glue(compute_all_sub, SUFFIX)(DATA_TYPE dst, DATA_TYPE src2)
 {
-    uint32_t cf, pf, af, zf, sf, of;
     DATA_TYPE src1 = dst + src2;
-
-    cf = src1 < src2;
-    pf = compute_pf(dst);
-    af = (dst ^ src1 ^ src2) & CC_A;
-    zf = (dst == 0) * CC_Z;
-    sf = lshift(dst, 8 - DATA_BITS) & CC_S;
-    of = lshift((src1 ^ src2) & (src1 ^ dst), 12 - DATA_BITS) & CC_O;
-    return cf + pf + af + zf + sf + of;
+    DATA_TYPE carries = SUB_COUT_VEC(src1, src2, dst);
+    return glue(compute_all_cout, SUFFIX)(dst, carries);
 }
 
 static int glue(compute_c_sub, SUFFIX)(DATA_TYPE dst, DATA_TYPE src2)
@@ -123,25 +114,9 @@ static int glue(compute_c_sub, SUFFIX)(DATA_TYPE dst, DATA_TYPE src2)
 static uint32_t glue(compute_all_sbb, SUFFIX)(DATA_TYPE dst, DATA_TYPE src2,
                                               DATA_TYPE src3)
 {
-    uint32_t cf, pf, af, zf, sf, of;
-
-#ifdef WIDER_TYPE
-    WIDER_TYPE src23 = (WIDER_TYPE) src2 + (WIDER_TYPE) src3;
-    DATA_TYPE src1 = dst + src23;
-
-    cf = src1 < src23;
-#else
     DATA_TYPE src1 = dst + src2 + src3;
-
-    cf = (src3 ? src1 <= src2 : src1 < src2);
-#endif
-
-    pf = compute_pf(dst);
-    af = (dst ^ src1 ^ src2) & 0x10;
-    zf = (dst == 0) << 6;
-    sf = lshift(dst, 8 - DATA_BITS) & 0x80;
-    of = lshift((src1 ^ src2) & (src1 ^ dst), 12 - DATA_BITS) & CC_O;
-    return cf + pf + af + zf + sf + of;
+    DATA_TYPE carries = SUB_COUT_VEC(src1, src2, dst);
+    return glue(compute_all_cout, SUFFIX)(dst, carries);
 }
 
 static int glue(compute_c_sbb, SUFFIX)(DATA_TYPE dst, DATA_TYPE src2,
@@ -286,6 +261,5 @@ static int glue(compute_c_blsi, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1)
 #undef DATA_BITS
 #undef SIGN_MASK
 #undef DATA_TYPE
-#undef DATA_MASK
 #undef SUFFIX
 #undef WIDER_TYPE
diff --git a/target/i386/hvf/x86_flags.c b/target/i386/hvf/x86_flags.c
index 60ab4f01a20..0c75e0419c3 100644
--- a/target/i386/hvf/x86_flags.c
+++ b/target/i386/hvf/x86_flags.c
@@ -45,31 +45,6 @@
 #define LF_MASK_CF     (0x01 << LF_BIT_CF)
 #define LF_MASK_PO     (0x01 << LF_BIT_PO)
 
-/* majority(NOT a, b, c) = (a ^ b) ? b : c */
-#define MAJ_INV1(a, b, c) ((((a) ^ (b)) & ((b) ^ (c))) ^ (c))
-
-/*
- * ADD_COUT_VEC(x, y) = majority((x + y) ^ x ^ y, x, y)
- *
- * If two corresponding bits in x and y are the same, that's the carry
- * independent of the value (x+y)^x^y.  Hence x^y can be replaced with
- * 1 in (x+y)^x^y, resulting in majority(NOT (x+y), x, y)
- */
-#define ADD_COUT_VEC(op1, op2, result) \
-    MAJ_INV1(result, op1, op2)
-
-/*
- * SUB_COUT_VEC(x, y) = NOT majority(x, NOT y, (x - y) ^ x ^ NOT y)
- *                    = majority(NOT x, y, (x - y) ^ x ^ y)
- *
- * Note that the carry out is actually a borrow, i.e. it is inverted.
- * If two corresponding bits in x and y are different, the value of the
- * bit in (x-y)^x^y likewise does not matter.  Hence, x^y can be replaced
- * with 0 in (x-y)^x^y, resulting in majority(NOT x, y, x-y)
- */
-#define SUB_COUT_VEC(op1, op2, result) \
-    MAJ_INV1(op1, op2, result)
-
 /* ******************* */
 /*       OSZAPC        */
 /* ******************* */
-- 
2.49.0
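
Not part of the patch, but a quick way to convince oneself of the borrow
identity: the throwaway program below (the macros are copied from the patch;
the harness itself is made up) exhaustively checks that the top bit of
SUB_COUT_VEC matches the CF computation this patch removes, cf = src1 < src2,
for all 8-bit operands:

    #include <stdint.h>
    #include <stdio.h>

    #define MAJ_INV1(a, b, c) ((((a) ^ (b)) & ((b) ^ (c))) ^ (c))
    #define SUB_COUT_VEC(op1, op2, result) MAJ_INV1(op1, op2, result)

    int main(void)
    {
        for (unsigned src1 = 0; src1 < 256; src1++) {
            for (unsigned src2 = 0; src2 < 256; src2++) {
                uint8_t dst = src1 - src2;                 /* the SUB result */
                uint8_t v = SUB_COUT_VEC(src1, src2, dst); /* borrow vector */
                int cf_old = src1 < src2;                  /* removed CF code */
                if ((v >> 7) != cf_old) {
                    printf("mismatch: %u - %u\n", src1, src2);
                    return 1;
                }
            }
        }
        printf("all 65536 cases OK\n");
        return 0;
    }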