Hi Jakub,
For 401.bzip2 it looks perfect. This loop is vectorized:
.L6:
vmovdqa (%rsi,%rax), %ymm0
addl $1, %ecx
vpsrad $8, %ymm0, %ymm0
vpsrld $31, %ymm0, %ymm1
vpaddd %ymm1, %ymm0, %ymm0
vpsrad $1, %ymm0, %ymm0
vpaddd %ymm2, %ymm0, %ymm0
vpslld $8, %ymm0, %ymm0
vmovdqa %ymm0, (%rsi,%rax)
addq $32, %rax
cmpl %r8d, %ecx
jne .L6
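Per element, that sequence is the usual round-toward-zero fixup for
signed division by 2. Here is a scalar sketch of what one vector lane
computes (my own illustration, names made up; assumes GCC's arithmetic
right shift on signed ints):

static int
lane (int x)
{
  int j = x >> 8;                             /* vpsrad $8  */
  unsigned int sign = (unsigned int) j >> 31; /* vpsrld $31: 1 if j < 0 */
  j = (j + (int) sign) >> 1;                  /* vpaddd; vpsrad $1: j / 2 */
  j = j + 1;                                  /* vpaddd %ymm2 (vector of 1s) */
  return j << 8;                              /* vpslld $8  */
}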
Unfortunately, I have no performance simulator for AVX2. But I believe it
will benefit, since in the absence of that optimization we get (same
options):
.L3:
movl (%rax), %edx
sarl $8, %edx
movl %edx, %ecx
shrl $31, %ecx
addl %ecx, %edx
sarl %edx
addl $1, %edx
sall $8, %edx
movl %edx, (%rax)
addq $4, %rax
cmpq %rsi, %rax
jne .L3
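For reference, both snippets above correspond to a source loop of this
shape (essentially f1 from the testcase in your patch):

extern int a[4096];

void
f1 (int x)
{
  int i, j;
  for (i = 1; i <= x; i++)
    {
      j = a[i] >> 8;
      j = 1 + (j / 2);  /* signed division by a power of two */
      a[i] = j << 8;
    }
}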
Thanks, K
On Wed, Dec 14, 2011 at 4:25 PM, Jakub Jelinek <[email protected]> wrote:
> On Tue, Dec 13, 2011 at 05:57:40PM +0400, Kirill Yukhin wrote:
>> > Let me hack up a quick pattern recognizer for this...
>
> Here it is, untested so far.
> On the testcase, doing 2000000 f1+f2+f3+f4 calls in the loop with -O3 -mavx
> on Sandybridge (so, vectorized just with 16-byte vectors) gives:
> vanilla 0m34.571s
> the tree-vect* parts of this patch only 0m9.013s
> the whole patch 0m8.824s
> The i386 parts are just a small optimization, I guess it could be
> done in the vectorizer too (but then we'd have to check whether the
> arithmetic/logical right shifts are supported and check costs?), or
> perhaps in the generic vcond expander (again, we'd need to check some
> costs).
>
> Can you see if it gives a measurable speedup on SPEC bzip2?
>
> 2011-12-14 Jakub Jelinek <[email protected]>
>
> * tree-vectorizer.h (NUM_PATTERNS): Bump to 10.
> * tree-vect-patterns.c (vect_recog_sdivmod_pow2_pattern): New
> function.
> (vect_vect_recog_func_ptrs): Add it.
>
> * config/i386/sse.md (vcond<V_256:mode><VI_256:mode>,
> vcond<V_128:mode><VI124_128:mode>, vcond<VI8F_128:mode>v2di):
> Use general_operand instead of nonimmediate_operand for
> operand 5 and no predicate for operands 1 and 2.
> * config/i386/i386.c (ix86_expand_int_vcond): Optimize
> x < 0 ? -1 : 0 and x < 0 ? 1 : 0 into vector arithmetic
> resp. logical shift.
>
> * gcc.dg/vect/vect-sdivmod-1.c: New test.
>
> --- gcc/tree-vectorizer.h.jj 2011-12-14 08:11:03.592010101 +0100
> +++ gcc/tree-vectorizer.h 2011-12-14 08:28:57.006693371 +0100
> @@ -929,7 +929,7 @@ extern void vect_slp_transform_bb (basic
> Additional pattern recognition functions can (and will) be added
> in the future. */
> typedef gimple (* vect_recog_func_ptr) (VEC (gimple, heap) **, tree *, tree *);
> -#define NUM_PATTERNS 9
> +#define NUM_PATTERNS 10
> void vect_pattern_recog (loop_vec_info);
>
> /* In tree-vectorizer.c. */
> --- gcc/tree-vect-patterns.c.jj 2011-12-02 01:52:27.918883940 +0100
> +++ gcc/tree-vect-patterns.c 2011-12-14 11:50:10.439763740 +0100
> @@ -53,6 +53,8 @@ static gimple vect_recog_widen_shift_pat
> tree *, tree *);
> static gimple vect_recog_vector_vector_shift_pattern (VEC (gimple, heap) **,
> tree *, tree *);
> +static gimple vect_recog_sdivmod_pow2_pattern (VEC (gimple, heap) **,
> + tree *, tree *);
> static gimple vect_recog_mixed_size_cond_pattern (VEC (gimple, heap) **,
> tree *, tree *);
> static gimple vect_recog_bool_pattern (VEC (gimple, heap) **, tree *, tree *);
> @@ -64,6 +66,7 @@ static vect_recog_func_ptr vect_vect_rec
> vect_recog_over_widening_pattern,
> vect_recog_widen_shift_pattern,
> vect_recog_vector_vector_shift_pattern,
> + vect_recog_sdivmod_pow2_pattern,
> vect_recog_mixed_size_cond_pattern,
> vect_recog_bool_pattern};
>
> @@ -1573,6 +1576,211 @@ vect_recog_vector_vector_shift_pattern (
> return pattern_stmt;
> }
>
> +/* Detect a signed division by power of two constant that wouldn't be
> + otherwise vectorized:
> +
> + type a_t, b_t;
> +
> + S1 a_t = b_t / N;
> +
> + where type 'type' is a signed integral type and N is a constant positive
> + power of two.
> +
> + Similarly handle signed modulo by power of two constant:
> +
> + S4 a_t = b_t % N;
> +
> + Input/Output:
> +
> + * STMTS: Contains a stmt from which the pattern search begins,
> + i.e. the division stmt. S1 is replaced by:
> + S3 y_t = b_t < 0 ? N - 1 : 0;
> + S2 x_t = b_t + y_t;
> + S1' a_t = x_t >> log2 (N);
> +
> + S4 is replaced by (where *_T temporaries have unsigned type):
> + S9 y_T = b_t < 0 ? -1U : 0U;
> + S8 z_T = y_T >> (sizeof (type_t) * CHAR_BIT - log2 (N));
> + S7 z_t = (type) z_T;
> + S6 w_t = b_t + z_t;
> + S5 x_t = w_t & (N - 1);
> + S4' a_t = x_t - z_t;
> +
> + Output:
> +
> + * TYPE_IN: The type of the input arguments to the pattern.
> +
> + * TYPE_OUT: The type of the output of this pattern.
> +
> + * Return value: A new stmt that will be used to replace the division
> + S1 or modulo S4 stmt. */
> +
> +static gimple
> +vect_recog_sdivmod_pow2_pattern (VEC (gimple, heap) **stmts,
> + tree *type_in, tree *type_out)
> +{
> + gimple last_stmt = VEC_pop (gimple, *stmts);
> + gimple_stmt_iterator gsi;
> + tree oprnd0, oprnd1, vectype, itype, cond;
> + gimple pattern_stmt, def_stmt;
> + enum tree_code rhs_code;
> + stmt_vec_info stmt_vinfo = vinfo_for_stmt (last_stmt);
> + loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
> + optab optab;
> +
> + if (!is_gimple_assign (last_stmt))
> + return NULL;
> +
> + rhs_code = gimple_assign_rhs_code (last_stmt);
> + switch (rhs_code)
> + {
> + case TRUNC_DIV_EXPR:
> + case TRUNC_MOD_EXPR:
> + break;
> + default:
> + return NULL;
> + }
> +
> + if (STMT_VINFO_IN_PATTERN_P (stmt_vinfo))
> + return NULL;
> +
> + oprnd0 = gimple_assign_rhs1 (last_stmt);
> + oprnd1 = gimple_assign_rhs2 (last_stmt);
> + itype = TREE_TYPE (oprnd0);
> + if (TREE_CODE (oprnd0) != SSA_NAME
> + || TREE_CODE (oprnd1) != INTEGER_CST
> + || TREE_CODE (itype) != INTEGER_TYPE
> + || TYPE_UNSIGNED (itype)
> + || TYPE_PRECISION (itype) != GET_MODE_PRECISION (TYPE_MODE (itype))
> + || !integer_pow2p (oprnd1)
> + || tree_int_cst_sgn (oprnd1) != 1)
> + return NULL;
> +
> + vectype = get_vectype_for_scalar_type (itype);
> + if (vectype == NULL_TREE)
> + return NULL;
> +
> + /* If the target can handle vectorized division or modulo natively,
> + don't attempt to optimize this. */
> + optab = optab_for_tree_code (rhs_code, vectype, optab_default);
> + if (optab != NULL)
> + {
> + enum machine_mode vec_mode = TYPE_MODE (vectype);
> + int icode = (int) optab_handler (optab, vec_mode);
> + if (icode != CODE_FOR_nothing
> + || GET_MODE_SIZE (vec_mode) == UNITS_PER_WORD)
> + return NULL;
> + }
> +
> + /* Pattern detected. */
> + if (vect_print_dump_info (REPORT_DETAILS))
> + fprintf (vect_dump, "vect_recog_sdivmod_pow2_pattern: detected: ");
> +
> + cond = build2 (LT_EXPR, boolean_type_node, oprnd0, build_int_cst (itype, 0));
> + gsi = gsi_for_stmt (last_stmt);
> + if (rhs_code == TRUNC_DIV_EXPR)
> + {
> + tree var = vect_recog_temp_ssa_var (itype, NULL);
> + def_stmt
> + = gimple_build_assign_with_ops3 (COND_EXPR, var, cond,
> + fold_build2 (MINUS_EXPR, itype,
> + oprnd1,
> + build_int_cst (itype,
> + 1)),
> + build_int_cst (itype, 0));
> + gsi_insert_before (&gsi, def_stmt, GSI_SAME_STMT);
> + set_vinfo_for_stmt (def_stmt, new_stmt_vec_info (def_stmt, loop_vinfo,
> + NULL));
> + var = vect_recog_temp_ssa_var (itype, NULL);
> + def_stmt
> + = gimple_build_assign_with_ops (PLUS_EXPR, var, oprnd0,
> + gimple_assign_lhs (def_stmt));
> + STMT_VINFO_PATTERN_DEF_STMT (stmt_vinfo) = def_stmt;
> +
> + pattern_stmt
> + = gimple_build_assign_with_ops (RSHIFT_EXPR,
> + vect_recog_temp_ssa_var (itype, NULL),
> + var,
> + build_int_cst (itype,
> + tree_log2 (oprnd1)));
> + }
> + else
> + {
> + tree signmask;
> + tree utype = build_nonstandard_integer_type (TYPE_PRECISION (itype), 1);
> + tree shift = build_int_cst (utype, GET_MODE_BITSIZE (TYPE_MODE (itype))
> + - tree_log2 (oprnd1));
> + if (compare_tree_int (oprnd1, 2) == 0)
> + {
> + signmask = vect_recog_temp_ssa_var (itype, NULL);
> + def_stmt
> + = gimple_build_assign_with_ops3 (COND_EXPR, signmask, cond,
> + build_int_cst (itype, 1),
> + build_int_cst (itype, 0));
> + gsi_insert_before (&gsi, def_stmt, GSI_SAME_STMT);
> + set_vinfo_for_stmt (def_stmt,
> + new_stmt_vec_info (def_stmt, loop_vinfo, NULL));
> + }
> + else
> + {
> + tree var = vect_recog_temp_ssa_var (utype, NULL);
> + def_stmt
> + = gimple_build_assign_with_ops3 (COND_EXPR, var, cond,
> + build_int_cst (utype, -1),
> + build_int_cst (utype, 0));
> + gsi_insert_before (&gsi, def_stmt, GSI_SAME_STMT);
> + set_vinfo_for_stmt (def_stmt,
> + new_stmt_vec_info (def_stmt, loop_vinfo, NULL));
> + var = vect_recog_temp_ssa_var (utype, NULL);
> + def_stmt
> + = gimple_build_assign_with_ops (RSHIFT_EXPR, var,
> + gimple_assign_lhs (def_stmt),
> + shift);
> + gsi_insert_before (&gsi, def_stmt, GSI_SAME_STMT);
> + set_vinfo_for_stmt (def_stmt,
> + new_stmt_vec_info (def_stmt, loop_vinfo, NULL));
> + signmask = vect_recog_temp_ssa_var (itype, NULL);
> + def_stmt
> + = gimple_build_assign_with_ops (NOP_EXPR, signmask, var,
> + NULL_TREE);
> + gsi_insert_before (&gsi, def_stmt, GSI_SAME_STMT);
> + set_vinfo_for_stmt (def_stmt,
> + new_stmt_vec_info (def_stmt, loop_vinfo, NULL));
> + }
> + def_stmt
> + = gimple_build_assign_with_ops (PLUS_EXPR,
> + vect_recog_temp_ssa_var (itype, NULL),
> + oprnd0, signmask);
> + gsi_insert_before (&gsi, def_stmt, GSI_SAME_STMT);
> + set_vinfo_for_stmt (def_stmt, new_stmt_vec_info (def_stmt, loop_vinfo,
> + NULL));
> + def_stmt
> + = gimple_build_assign_with_ops (BIT_AND_EXPR,
> + vect_recog_temp_ssa_var (itype, NULL),
> + gimple_assign_lhs (def_stmt),
> + fold_build2 (MINUS_EXPR, itype,
> + oprnd1,
> + build_int_cst (itype,
> + 1)));
> + STMT_VINFO_PATTERN_DEF_STMT (stmt_vinfo) = def_stmt;
> +
> + pattern_stmt
> + = gimple_build_assign_with_ops (MINUS_EXPR,
> + vect_recog_temp_ssa_var (itype, NULL),
> + gimple_assign_lhs (def_stmt),
> + signmask);
> + }
> +
> + if (vect_print_dump_info (REPORT_DETAILS))
> + print_gimple_stmt (vect_dump, pattern_stmt, 0, TDF_SLIM);
> +
> + VEC_safe_push (gimple, heap, *stmts, last_stmt);
> +
> + *type_in = vectype;
> + *type_out = vectype;
> + return pattern_stmt;
> +}
> +
> /* Function vect_recog_mixed_size_cond_pattern
>
> Try to find the following pattern:
> --- gcc/config/i386/sse.md.jj 2011-12-03 12:22:33.000000000 +0100
> +++ gcc/config/i386/sse.md 2011-12-14 13:00:47.695788256 +0100
> @@ -6340,9 +6340,9 @@ (define_expand "vcond<V_256:mode><VI_256
> (if_then_else:V_256
> (match_operator 3 ""
> [(match_operand:VI_256 4 "nonimmediate_operand" "")
> - (match_operand:VI_256 5 "nonimmediate_operand" "")])
> - (match_operand:V_256 1 "general_operand" "")
> - (match_operand:V_256 2 "general_operand" "")))]
> + (match_operand:VI_256 5 "general_operand" "")])
> + (match_operand:V_256 1 "" "")
> + (match_operand:V_256 2 "" "")))]
> "TARGET_AVX2
> && (GET_MODE_NUNITS (<V_256:MODE>mode)
> == GET_MODE_NUNITS (<VI_256:MODE>mode))"
> @@ -6357,9 +6357,9 @@ (define_expand "vcond<V_128:mode><VI124_
> (if_then_else:V_128
> (match_operator 3 ""
> [(match_operand:VI124_128 4 "nonimmediate_operand" "")
> - (match_operand:VI124_128 5 "nonimmediate_operand" "")])
> - (match_operand:V_128 1 "general_operand" "")
> - (match_operand:V_128 2 "general_operand" "")))]
> + (match_operand:VI124_128 5 "general_operand" "")])
> + (match_operand:V_128 1 "" "")
> + (match_operand:V_128 2 "" "")))]
> "TARGET_SSE2
> && (GET_MODE_NUNITS (<V_128:MODE>mode)
> == GET_MODE_NUNITS (<VI124_128:MODE>mode))"
> @@ -6374,9 +6374,9 @@ (define_expand "vcond<VI8F_128:mode>v2di
> (if_then_else:VI8F_128
> (match_operator 3 ""
> [(match_operand:V2DI 4 "nonimmediate_operand" "")
> - (match_operand:V2DI 5 "nonimmediate_operand" "")])
> - (match_operand:VI8F_128 1 "general_operand" "")
> - (match_operand:VI8F_128 2 "general_operand" "")))]
> + (match_operand:V2DI 5 "general_operand" "")])
> + (match_operand:VI8F_128 1 "" "")
> + (match_operand:VI8F_128 2 "" "")))]
> "TARGET_SSE4_2"
> {
> bool ok = ix86_expand_int_vcond (operands);
> --- gcc/config/i386/i386.c.jj 2011-12-14 08:11:01.000000000 +0100
> +++ gcc/config/i386/i386.c 2011-12-14 13:01:30.207538465 +0100
> @@ -19434,6 +19434,45 @@ ix86_expand_int_vcond (rtx operands[])
> cop0 = operands[4];
> cop1 = operands[5];
>
> + /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
> + and x < 0 ? 1 : 0 into (unsigned) x >> 31. */
> + if ((code == LT || code == GE)
> + && data_mode == mode
> + && cop1 == CONST0_RTX (mode)
> + && operands[1 + (code == LT)] == CONST0_RTX (data_mode)
> + && GET_MODE_SIZE (GET_MODE_INNER (data_mode)) > 1
> + && GET_MODE_SIZE (GET_MODE_INNER (data_mode)) <= 8
> + && (GET_MODE_SIZE (data_mode) == 16
> + || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32)))
> + {
> + rtx negop = operands[2 - (code == LT)];
> + int shift = GET_MODE_BITSIZE (GET_MODE_INNER (data_mode)) - 1;
> + if (negop == CONST1_RTX (data_mode))
> + {
> + rtx res = expand_simple_binop (mode, LSHIFTRT, cop0, GEN_INT (shift),
> + operands[0], 1, OPTAB_DIRECT);
> + if (res != operands[0])
> + emit_move_insn (operands[0], res);
> + return true;
> + }
> + else if (GET_MODE_INNER (data_mode) != DImode
> + && vector_all_ones_operand (negop, data_mode))
> + {
> + rtx res = expand_simple_binop (mode, ASHIFTRT, cop0, GEN_INT (shift),
> + operands[0], 0, OPTAB_DIRECT);
> + if (res != operands[0])
> + emit_move_insn (operands[0], res);
> + return true;
> + }
> + }
> +
> + if (!nonimmediate_operand (cop1, mode))
> + cop1 = force_reg (mode, cop1);
> + if (!general_operand (operands[1], data_mode))
> + operands[1] = force_reg (data_mode, operands[1]);
> + if (!general_operand (operands[2], data_mode))
> + operands[2] = force_reg (data_mode, operands[2]);
> +
> /* XOP supports all of the comparisons on all 128-bit vector int types. */
> if (TARGET_XOP
> && (mode == V16QImode || mode == V8HImode
> --- gcc/testsuite/gcc.dg/vect/vect-sdivmod-1.c.jj	2011-12-14 13:05:36.240133846 +0100
> +++ gcc/testsuite/gcc.dg/vect/vect-sdivmod-1.c	2011-12-14 13:08:10.288228745 +0100
> @@ -0,0 +1,98 @@
> +#include "tree-vect.h"
> +
> +extern void abort (void);
> +int a[4096];
> +
> +__attribute__((noinline, noclone)) void
> +f1 (int x)
> +{
> + int i, j;
> + for (i = 1; i <= x; i++)
> + {
> + j = a[i] >> 8;
> + j = 1 + (j / 2);
> + a[i] = j << 8;
> + }
> +}
> +
> +__attribute__((noinline, noclone)) void
> +f2 (int x)
> +{
> + int i, j;
> + for (i = 1; i <= x; i++)
> + {
> + j = a[i] >> 8;
> + j = 1 + (j / 16);
> + a[i] = j << 8;
> + }
> +}
> +
> +__attribute__((noinline, noclone)) void
> +f3 (int x)
> +{
> + int i, j;
> + for (i = 1; i <= x; i++)
> + {
> + j = a[i] >> 8;
> + j = 1 + (j % 2);
> + a[i] = j << 8;
> + }
> +}
> +
> +__attribute__((noinline, noclone)) void
> +f4 (int x)
> +{
> + int i, j;
> + for (i = 1; i <= x; i++)
> + {
> + j = a[i] >> 8;
> + j = 1 + (j % 16);
> + a[i] = j << 8;
> + }
> +}
> +
> +int
> +main ()
> +{
> + int i;
> + check_vect ();
> + for (i = 0; i < 4096; i++)
> + {
> + asm ("");
> + a[i] = (i - 2048) << 8;
> + }
> + f1 (4095);
> + if (a[0] != (-2048 << 8))
> + abort ();
> + for (i = 1; i < 4096; i++)
> + if (a[i] != ((1 + ((i - 2048) / 2)) << 8))
> + abort ();
> + else
> + a[i] = (i - 2048) << 8;
> + f2 (4095);
> + if (a[0] != (-2048 << 8))
> + abort ();
> + for (i = 1; i < 4096; i++)
> + if (a[i] != ((1 + ((i - 2048) / 16)) << 8))
> + abort ();
> + else
> + a[i] = (i - 2048) << 8;
> + f3 (4095);
> + if (a[0] != (-2048 << 8))
> + abort ();
> + for (i = 1; i < 4096; i++)
> + if (a[i] != ((1 + ((i - 2048) % 2)) << 8))
> + abort ();
> + else
> + a[i] = (i - 2048) << 8;
> + f4 (4095);
> + if (a[0] != (-2048 << 8))
> + abort ();
> + for (i = 1; i < 4096; i++)
> + if (a[i] != ((1 + ((i - 2048) % 16)) << 8))
> + abort ();
> + return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 4 "vect" { target vect_condition } } } */
> +/* { dg-final { cleanup-tree-dump "vect" } } */
>
> Jakub