Given the following loop: int a[N]; short b[N*2];
for (int i = 0; i < N; ++i) a[i] = b[i*2]; After being vectorized, the access to b[i*2] will be compiled into several packing statements, while the type promotion from short to int will be compiled into several unpacking statements. With this patch, each pair of pack/unpack statements will be replaced by less expensive statements (with shift or bit-and operations). On x86_64, the loop above will be compiled into the following assembly (with -O2 -ftree-vectorize): movdqu 0x10(%rcx),%xmm3 movdqu -0x20(%rcx),%xmm0 movdqa %xmm0,%xmm2 punpcklwd %xmm3,%xmm0 punpckhwd %xmm3,%xmm2 movdqa %xmm0,%xmm3 punpcklwd %xmm2,%xmm0 punpckhwd %xmm2,%xmm3 movdqa %xmm1,%xmm2 punpcklwd %xmm3,%xmm0 pcmpgtw %xmm0,%xmm2 movdqa %xmm0,%xmm3 punpckhwd %xmm2,%xmm0 punpcklwd %xmm2,%xmm3 movups %xmm0,-0x10(%rdx) movups %xmm3,-0x20(%rdx) With this patch, the generated assembly is shown below: movdqu 0x10(%rcx),%xmm0 movdqu -0x20(%rcx),%xmm1 pslld $0x10,%xmm0 psrad $0x10,%xmm0 pslld $0x10,%xmm1 movups %xmm0,-0x10(%rdx) psrad $0x10,%xmm1 movups %xmm1,-0x20(%rdx) Bootstrapped and tested on x86-64. OK for trunk? thanks, Cong
diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 117cdd0..e7143f1 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,8 @@ +2014-04-23 Cong Hou <co...@google.com> + + * tree-vect-stmts.c (detect_pack_unpack_pattern): New function. + (vect_gen_widened_results_half): Call detect_pack_unpack_pattern. + 2014-04-23 David Malcolm <dmalc...@redhat.com> * is-a.h: Update comments to reflect the following changes to the diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index 62b07f4..a8755b3 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,7 @@ +2014-04-23 Cong Hou <co...@google.com> + + * gcc.dg/vect/vect-125.c: New test. + 2014-04-23 Jeff Law <l...@redhat.com> PR tree-optimization/60902 diff --git a/gcc/testsuite/gcc.dg/vect/vect-125.c b/gcc/testsuite/gcc.dg/vect/vect-125.c new file mode 100644 index 0000000..988dea6 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-125.c @@ -0,0 +1,122 @@ +/* { dg-require-effective-target vect_int } */ + +#include <limits.h> +#include "tree-vect.h" + +#define N 64 + +char b[N]; +unsigned char c[N]; +short d[N]; +unsigned short e[N]; + +__attribute__((noinline)) void +test1 () +{ + int a[N]; + int i; + for (i = 0; i < N/2; i++) + { + a[i] = b[i*2]; + a[i+N/2] = b[i*2+1]; + } + for (i = 0; i < N/2; i++) + if (a[i] != b[i*2] || a[i+N/2] != b[i*2+1]) + abort (); + + for (i = 0; i < N/2; i++) + { + a[i] = c[i*2]; + a[i+N/2] = c[i*2+1]; + } + for (i = 0; i < N/2; i++) + if (a[i] != c[i*2] || a[i+N/2] != c[i*2+1]) + abort (); + + for (i = 0; i < N/2; i++) + { + a[i] = d[i*2]; + a[i+N/2] = d[i*2+1]; + } + for (i = 0; i < N/2; i++) + if (a[i] != d[i*2] || a[i+N/2] != d[i*2+1]) + abort (); + + for (i = 0; i < N/2; i++) + { + a[i] = e[i*2]; + a[i+N/2] = e[i*2+1]; + } + for (i = 0; i < N/2; i++) + if (a[i] != e[i*2] || a[i+N/2] != e[i*2+1]) + abort (); +} + +__attribute__((noinline)) void +test2 () +{ + unsigned int a[N]; + int i; + for (i = 0; i < N/2; i++) + { + a[i] = b[i*2]; + a[i+N/2] = b[i*2+1]; + } + for (i = 0; i < N/2; i++) + if (a[i] != b[i*2] || a[i+N/2] != b[i*2+1]) + abort (); + + for (i = 0; i < N/2; i++) + { + a[i] = c[i*2]; + a[i+N/2] = c[i*2+1]; + } + for (i = 0; i < N/2; i++) + if (a[i] != c[i*2] || a[i+N/2] != c[i*2+1]) + abort (); + + for (i = 0; i < N/2; i++) + { + a[i] = d[i*2]; + a[i+N/2] = d[i*2+1]; + } + for (i = 0; i < N/2; i++) + if (a[i] != d[i*2] || a[i+N/2] != d[i*2+1]) + abort (); + + for (i = 0; i < N/2; i++) + { + a[i] = e[i*2]; + a[i+N/2] = e[i*2+1]; + } + for (i = 0; i < N/2; i++) + if (a[i] != e[i*2] || a[i+N/2] != e[i*2+1]) + abort (); +} + +int +main () +{ + b[0] = CHAR_MIN; + c[0] = UCHAR_MAX; + d[0] = SHRT_MIN; + e[0] = USHRT_MAX; + + int i; + for (i = 1; i < N; i++) + { + b[i] = b[i-1] + 1; + c[i] = c[i-1] - 1; + d[i] = d[i-1] + 1; + e[i] = e[i-1] - 1; + } + + test1 (); + test2 (); + return 0; +} + +/* { dg-final { scan-tree-dump-times "vectorized 4 loops" 2 "vect" } } */ +/* { dg-final { scan-tree-dump-times "A pack-unpack pattern is recognized" 32 "vect" } } */ +/* { dg-final { cleanup-tree-dump "vect" } } */ + diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c index 1a51d6d..d0cf1f4 100644 --- a/gcc/tree-vect-stmts.c +++ b/gcc/tree-vect-stmts.c @@ -3191,6 +3191,174 @@ vectorizable_simd_clone_call (gimple stmt, gimple_stmt_iterator *gsi, } +/* Function detect_pack_unpack_pattern + + Detect the following pattern: + + S1 vect3 = VEC_PERM_EXPR <vect1, vect2, { 0, 2, 4, ... }>; + or + S1 vect3 = VEC_PERM_EXPR <vect1, vect2, { 1, 3, 5, ... }>; + + S2 vect4 = [vec_unpack_lo_expr] vect3; + or/and + S3 vect5 = [vec_unpack_hi_expr] vect3; + + S1 is usually generated from accessing even/odd elements of an array + and S2/S3 are generated from type promotion. In this case S1 and S2/S3 can + be replaced by less expensive statements. */ + +static gimple +detect_pack_unpack_pattern (enum tree_code code, tree vec_oprnd, int op_type, + tree vec_dest, gimple_stmt_iterator *gsi, + gimple stmt) +{ + if ((code != VEC_UNPACK_LO_EXPR && code != VEC_UNPACK_HI_EXPR) + || op_type != unary_op) + return NULL; + + if (TREE_CODE (vec_oprnd) != SSA_NAME) + return NULL; + + gimple def_stmt = SSA_NAME_DEF_STMT (vec_oprnd); + if (!is_gimple_assign (def_stmt)) + return NULL; + enum tree_code def_code = gimple_assign_rhs_code (def_stmt); + if (def_code != VEC_PERM_EXPR) + return NULL; + + bool unpack_lo = (code == VEC_UNPACK_LO_EXPR); + tree perm_vec = gimple_assign_rhs3 (def_stmt); + gcc_assert (TREE_CODE (perm_vec) == VECTOR_CST); + + /* Check if VEC_PERM_EXPR is generated from accessing even/odd elements of + an array. */ + int nelts = VECTOR_CST_NELTS (perm_vec); + int odd_num = 0; + for (int i = 0; i < nelts / 2; i++) + { + tree elt = VECTOR_CST_ELT (perm_vec, + unpack_lo ? i : i + nelts / 2); + int elt_val = (HOST_WIDE_INT) TREE_INT_CST_LOW (elt); + if (!unpack_lo) + elt_val -= nelts; + + if (elt_val / 2 != i) + return NULL; + odd_num += elt_val & 1; + } + + bool even_elem = false; + if (odd_num == 0) + even_elem = true; + else if (odd_num != nelts / 2) + return NULL; + + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, + "A pack-unpack pattern is recognized.\n"); + + tree vec_oprnd0 = unpack_lo ? gimple_assign_rhs1 (def_stmt) + : gimple_assign_rhs2 (def_stmt); + tree vec_type = TREE_TYPE (vec_oprnd0); + tree scalar_type = lang_hooks.types.type_for_mode ( + GET_MODE_INNER (TYPE_MODE (vec_type)), + TYPE_UNSIGNED (vec_type)); + tree vectype_out = TREE_TYPE (vec_dest); + + if (even_elem) + { + /* If even elements are packed, and if the original values are unsigned, + build a bit_and statement to replace the pack-unpack statements. + If the original values are signed, build a lshift statement and a + rshift statement to replace the pack-unpack statements. */ + + if (TYPE_UNSIGNED (vec_type)) + { + tree *mask = XALLOCAVEC (tree, nelts); + unsigned HOST_WIDE_INT max_val = TREE_INT_CST_LOW ( + TYPE_MAX_VALUE (scalar_type)); + for (int k = 0; k < nelts; ++k) + mask[k] = build_int_cst (scalar_type, (k & 1) ? 0 : max_val); + tree vec_oprnd1 = build_vector (TREE_TYPE (vec_oprnd0), mask); + tree temp = make_temp_ssa_name (vec_type, NULL, "vect_temp"); + + gimple new_stmt1, new_stmt2; + new_stmt1 = gimple_build_assign_with_ops (BIT_AND_EXPR, temp, + vec_oprnd0, vec_oprnd1); + new_stmt2 = gimple_build_assign_with_ops (NOP_EXPR, vec_dest, + temp, NULL_TREE); + gimple_assign_set_lhs (new_stmt2, + make_ssa_name (vec_dest, new_stmt2)); + + vect_finish_stmt_generation (stmt, new_stmt1, gsi); + vect_finish_stmt_generation (stmt, new_stmt2, gsi); + + return new_stmt2; + } + else + { + tree scalar_itype = lang_hooks.types.type_for_mode ( + GET_MODE_INNER (TYPE_MODE (vectype_out)), 0); + tree vecitype = get_vectype_for_scalar_type (scalar_itype); + tree shift_oprnd = build_int_cst (scalar_type, + TYPE_PRECISION (scalar_type)); + + tree temp1 = make_temp_ssa_name (vecitype, NULL, "vect_temp"); + tree temp2 = make_temp_ssa_name (vecitype, NULL, "vect_temp"); + tree temp3 = make_temp_ssa_name (vecitype, NULL, "vect_temp"); + + gimple new_stmt1, new_stmt2, new_stmt3, new_stmt4; + new_stmt1 = gimple_build_assign_with_ops (NOP_EXPR, temp1, + vec_oprnd0, NULL); + new_stmt2 = gimple_build_assign_with_ops (LSHIFT_EXPR, temp2, + temp1, shift_oprnd); + new_stmt3 = gimple_build_assign_with_ops (RSHIFT_EXPR, temp3, + temp2, shift_oprnd); + new_stmt4 = gimple_build_assign_with_ops (NOP_EXPR, vec_dest, + temp3, NULL); + gimple_assign_set_lhs (new_stmt4, + make_ssa_name (vec_dest, new_stmt4)); + + vect_finish_stmt_generation (stmt, new_stmt1, gsi); + vect_finish_stmt_generation (stmt, new_stmt2, gsi); + vect_finish_stmt_generation (stmt, new_stmt3, gsi); + vect_finish_stmt_generation (stmt, new_stmt4, gsi); + + return new_stmt4; + } + } + else + { + /* If odd elements are packed, build a rshift statement to replace + the pack-unpack statements. */ + + gimple new_stmt1, new_stmt2, new_stmt3; + tree scalar_itype = lang_hooks.types.type_for_mode ( + GET_MODE_INNER (TYPE_MODE (vectype_out)), + TYPE_UNSIGNED (scalar_type)); + tree vecitype = get_vectype_for_scalar_type (scalar_itype); + tree temp1 = make_temp_ssa_name (vecitype, NULL, "vect_temp"); + tree temp2 = make_temp_ssa_name (vecitype, NULL, "vect_temp"); + tree shift_oprnd = build_int_cst (scalar_type, + TYPE_PRECISION (scalar_type)); + + new_stmt1 = gimple_build_assign_with_ops (NOP_EXPR, temp1, + vec_oprnd0, NULL); + new_stmt2 = gimple_build_assign_with_ops (RSHIFT_EXPR, temp2, + temp1, shift_oprnd); + new_stmt3 = gimple_build_assign_with_ops (NOP_EXPR, vec_dest, + temp2, NULL); + gimple_assign_set_lhs (new_stmt3, + make_ssa_name (vec_dest, new_stmt2)); + + vect_finish_stmt_generation (stmt, new_stmt1, gsi); + vect_finish_stmt_generation (stmt, new_stmt2, gsi); + vect_finish_stmt_generation (stmt, new_stmt3, gsi); + + return new_stmt3; + } +} + /* Function vect_gen_widened_results_half Create a vector stmt whose code, type, number of arguments, and result @@ -3223,10 +3391,15 @@ vect_gen_widened_results_half (enum tree_code code, } else { + if ((new_stmt = detect_pack_unpack_pattern ( + code, vec_oprnd0, op_type, vec_dest, gsi, stmt))) + return new_stmt; + /* Generic support */ gcc_assert (op_type == TREE_CODE_LENGTH (code)); if (op_type != binary_op) vec_oprnd1 = NULL; + new_stmt = gimple_build_assign_with_ops (code, vec_dest, vec_oprnd0, vec_oprnd1); new_temp = make_ssa_name (vec_dest, new_stmt);