Current conditional compare (CCMP) support in GCC aims to optimize the short-circuit evaluation of cascaded comparisons.  Given a simple conditional compare candidate:

  if (a == 17 || a == 32)

it is represented in the IR as:

  t0 = a == 17
  t1 = a == 32
  t2 = t0 || t1

Normally, a CCMP sequence contains two parts: the first part calculates the initial condition using a normal compare instruction such as "cmp"; the second part can then use a "ccmp" instruction, which is executed conditionally based on the condition generated in the first step.

The problem is that the current implementation always expands t0 first, then t1.  The expansion order should take rtx costs into account, because "cmp" and "ccmp" may have different restrictions, so the order can result in performance differences.  For example, on AArch64 "ccmp" only accepts immediates within -31 ~ 31 while "cmp" accepts a wider range, so if we expand "a == 32" in the second step it will use "ccmp", and an extra "mov reg, 32" instruction is generated because 32 is out of range.  If we expand "a == 32" first instead, it is fine, as "cmp" can encode it.

The instruction difference for a simple testcase is listed below:

int foo(int a)
{
  if (a == 17 || a == 32)
    return 1;
  else
    return 2;
}

before
===
foo:
        mov     w1, 32
        cmp     w0, 17
        ccmp    w0, w1, 4, ne
        cset    w0, ne
        add     w0, w0, 1
        ret

after
===
foo:
        cmp     w0, 32
        ccmp    w0, 17, 4, ne
        cset    w0, ne
        add     w0, w0, 1
        ret

This patch still does not handle other, more complex situations.  For example, given:

  if (a == 1 || a == 29 || a == 32)

there are recursive calls of expand_ccmp_expr_1, while this patch only handles the innermost call, where both operands of the incoming gimple statement are comparison operations; a sketch of such a case is shown below.
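For illustration, such a multi-comparison case could be written as the standalone function below.  This is only a sketch (the function name and return values are arbitrary) and is not part of this patch or its testsuite:

/* Hypothetical example, not from the patch: with three cascaded
   comparisons, expand_ccmp_expr_1 is called recursively, and only the
   innermost call (where both operands are comparisons) gets the
   cost-based reordering, so the out-of-range immediate 32 may still
   require an extra "mov".  */
int
bar (int a)
{
  if (a == 1 || a == 29 || a == 32)
    return 1;
  else
    return 2;
}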
NOTE: the AArch64 backend can't cost the CCMP instruction accurately, so I have marked the testcase as XFAIL; this will be removed once we fix the cost issue.

2015-09-18  Jiong Wang  <jiong.w...@arm.com>

gcc/
  * ccmp.c (expand_ccmp_expr_1): Cost the instruction sequences
  generated from different expansion orders.

gcc/testsuite/
  * gcc.target/aarch64/ccmp_1.c: New testcase.

--
Regards,
Jiong
diff --git a/gcc/ccmp.c b/gcc/ccmp.c
index 3c3fbcd..dc985f6 100644
--- a/gcc/ccmp.c
+++ b/gcc/ccmp.c
@@ -51,6 +51,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "tree-outof-ssa.h"
 #include "cfgexpand.h"
 #include "ccmp.h"
+#include "predict.h"
 
 /* The following functions expand conditional compare (CCMP) instructions.
    Here is a short description about the over all algorithm:
@@ -165,6 +166,8 @@ expand_ccmp_next (gimple g, enum tree_code code, rtx prev,
 static rtx
 expand_ccmp_expr_1 (gimple g, rtx *prep_seq, rtx *gen_seq)
 {
+  rtx prep_seq_1, gen_seq_1;
+  rtx prep_seq_2, gen_seq_2;
   tree exp = gimple_assign_rhs_to_tree (g);
   enum tree_code code = TREE_CODE (exp);
   gimple gs0 = get_gimple_for_ssa_name (TREE_OPERAND (exp, 0));
@@ -180,19 +183,62 @@ expand_ccmp_expr_1 (gimple g, rtx *prep_seq, rtx *gen_seq)
     {
       if (TREE_CODE_CLASS (code1) == tcc_comparison)
         {
-          int unsignedp0;
-          enum rtx_code rcode0;
+          int unsignedp0, unsignedp1;
+          enum rtx_code rcode0, rcode1;
+          int speed_p = optimize_insn_for_speed_p ();
+          rtx tmp2, ret, ret2;
+          unsigned cost1 = MAX_COST;
+          unsigned cost2 = MAX_COST;
+          bool first_only_p = false;
+          bool second_only_p = false;
 
           unsignedp0 = TYPE_UNSIGNED (TREE_TYPE (gimple_assign_rhs1 (gs0)));
+          unsignedp1 = TYPE_UNSIGNED (TREE_TYPE (gimple_assign_rhs1 (gs1)));
           rcode0 = get_rtx_code (code0, unsignedp0);
-
-          tmp = targetm.gen_ccmp_first (prep_seq, gen_seq, rcode0,
+          rcode1 = get_rtx_code (code1, unsignedp1);
+          tmp = targetm.gen_ccmp_first (&prep_seq_1, &gen_seq_1, rcode0,
                                         gimple_assign_rhs1 (gs0),
                                         gimple_assign_rhs2 (gs0));
-          if (!tmp)
-            return NULL_RTX;
+          tmp2 = targetm.gen_ccmp_first (&prep_seq_2, &gen_seq_2, rcode1,
+                                         gimple_assign_rhs1 (gs1),
+                                         gimple_assign_rhs2 (gs1));
 
-          return expand_ccmp_next (gs1, code, tmp, prep_seq, gen_seq);
+          if (! tmp && ! tmp2)
+            return NULL_RTX;
+          else if (! tmp)
+            second_only_p = true;
+          else if (! tmp2)
+            first_only_p = true;
+
+
+          if (! second_only_p)
+            {
+              ret = expand_ccmp_next (gs1, code, tmp, &prep_seq_1, &gen_seq_1);
+              cost1 = seq_cost (safe_as_a <rtx_insn *> (prep_seq_1),
+                                speed_p);
+              cost1 += seq_cost (safe_as_a <rtx_insn *> (gen_seq_1), speed_p);
+            }
+          if (! first_only_p)
+            {
+              ret2 = expand_ccmp_next (gs0, code, tmp2, &prep_seq_2,
+                                       &gen_seq_2);
+              cost2 = seq_cost (safe_as_a <rtx_insn *> (prep_seq_2),
+                                speed_p);
+              cost2 += seq_cost (safe_as_a <rtx_insn *> (gen_seq_2), speed_p);
+            }
+
+          if (cost1 > cost2)
+            {
+              *prep_seq = prep_seq_2;
+              *gen_seq = gen_seq_2;
+              ret = ret2;
+            }
+          else
+            {
+              *prep_seq = prep_seq_1;
+              *gen_seq = gen_seq_1;
+            }
+          return ret;
         }
       else
         {
diff --git a/gcc/testsuite/gcc.target/aarch64/ccmp_1.c b/gcc/testsuite/gcc.target/aarch64/ccmp_1.c
new file mode 100644
index 0000000..f431af6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/ccmp_1.c
@@ -0,0 +1,13 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+int
+foo (int a)
+{
+  if (a == 17 || a == 32)
+    return 1;
+  else
+    return 2;
+}
+
+/* { dg-final { scan-assembler "cmp\t\.*32" { xfail aarch64*-*-* } } } */