Hi, The current widen-mult pattern only considers two operands with the same size. However, operands with different sizes can also benefit from this pattern. The following loop shows such an example:
char a[N]; short b[N]; int c[N]; for (int i = 0; i < N; ++i) c[i] = a[i] * b[i]; In this case, we can convert a[i] into short type then perform widen-mult on b[i] and the converted value: for (int i = 0; i < N; ++i) { short t = a[i]; c[i] = t w* b[i]; } This patch adds such support. In addition, the following loop fails to be recognized as a widen-mult pattern because the widening operation from char to int is not directly supported by the target: char a[N], b[N]; int c[N]; for (int i = 0; i < N; ++i) c[i] = a[i] * b[i]; In this case, we can still perform widen-mult on a[i] and b[i], and get a result of short type, then convert it to int: char a[N], b[N]; int c[N]; for (int i = 0; i < N; ++i) { short t = a[i] w* b[i]; c[i] = (int) t; } Currently GCC does not allow multi-step conversions for binary widening operations. This patch removes this restriction and uses VEC_UNPACK_LO_EXPR/VEC_UNPACK_HI_EXPR to arrange data after the widen-mult is performed for the widen-mult pattern. This can reduce several unpacking instructions (for this example, the number of packings/unpackings is reduced from 12 to 8. For SSE2, the inefficient multiplication between two V4SI vectors can also be avoided). Bootstrapped and tested on an x86_64 machine. thanks, Cong diff --git a/gcc/ChangeLog b/gcc/ChangeLog index f298c0b..44ed204 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,12 @@ +2013-12-02 Cong Hou <co...@google.com> + + * tree-vect-patterns.c (vect_recog_widen_mult_pattern): Enhance + the widen-mult pattern by handling two operands with different + sizes. + * tree-vect-stmts.c (vectorizable_conversion): Allow multi-step + conversions after widening mult operation. + (supportable_widening_operation): Likewise. 
+ 2013-11-22 Jakub Jelinek <ja...@redhat.com> PR sanitizer/59061 diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index 12d2c90..611ae1c 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,8 @@ +2013-12-02 Cong Hou <co...@google.com> + + * gcc.dg/vect/vect-widen-mult-u8-s16-s32.c: New test. + * gcc.dg/vect/vect-widen-mult-u8-u32.c: New test. + 2013-11-22 Jakub Jelinek <ja...@redhat.com> * c-c++-common/asan/no-redundant-instrumentation-7.c: Fix diff --git a/gcc/testsuite/gcc.dg/vect/vect-widen-mult-u8-s16-s32.c b/gcc/testsuite/gcc.dg/vect/vect-widen-mult-u8-s16-s32.c new file mode 100644 index 0000000..9f9081b --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-widen-mult-u8-s16-s32.c @@ -0,0 +1,48 @@ +/* { dg-require-effective-target vect_int } */ + +#include <stdarg.h> +#include "tree-vect.h" + +#define N 64 + +unsigned char X[N] __attribute__ ((__aligned__(__BIGGEST_ALIGNMENT__))); +short Y[N] __attribute__ ((__aligned__(__BIGGEST_ALIGNMENT__))); +int result[N]; + +/* unsigned char * short -> int widening-mult. 
*/ +__attribute__ ((noinline)) int +foo1(int len) { + int i; + + for (i=0; i<len; i++) { + result[i] = X[i] * Y[i]; + } +} + +int main (void) +{ + int i; + + check_vect (); + + for (i=0; i<N; i++) { + X[i] = i; + Y[i] = 64-i; + __asm__ volatile (""); + } + + foo1 (N); + + for (i=0; i<N; i++) { + if (result[i] != X[i] * Y[i]) + abort (); + } + + return 0; +} + +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target { vect_widen_mult_hi_to_si || vect_unpack } } } } */ +/* { dg-final { scan-tree-dump-times "vect_recog_widen_mult_pattern: detected" 1 "vect" { target vect_widen_mult_hi_to_si_pattern } } } */ +/* { dg-final { scan-tree-dump-times "pattern recognized" 1 "vect" { target vect_widen_mult_hi_to_si_pattern } } } */ +/* { dg-final { cleanup-tree-dump "vect" } } */ + diff --git a/gcc/testsuite/gcc.dg/vect/vect-widen-mult-u8-u32.c b/gcc/testsuite/gcc.dg/vect/vect-widen-mult-u8-u32.c new file mode 100644 index 0000000..51e9178 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-widen-mult-u8-u32.c @@ -0,0 +1,48 @@ +/* { dg-require-effective-target vect_int } */ + +#include <stdarg.h> +#include "tree-vect.h" + +#define N 64 + +unsigned char X[N] __attribute__ ((__aligned__(__BIGGEST_ALIGNMENT__))); +unsigned char Y[N] __attribute__ ((__aligned__(__BIGGEST_ALIGNMENT__))); +unsigned int result[N]; + +/* unsigned char-> unsigned int widening-mult. 
*/ +__attribute__ ((noinline)) int +foo1(int len) { + int i; + + for (i=0; i<len; i++) { + result[i] = X[i] * Y[i]; + } +} + +int main (void) +{ + int i; + + check_vect (); + + for (i=0; i<N; i++) { + X[i] = i; + Y[i] = 64-i; + __asm__ volatile (""); + } + + foo1 (N); + + for (i=0; i<N; i++) { + if (result[i] != X[i] * Y[i]) + abort (); + } + + return 0; +} + +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target { vect_widen_mult_qi_to_hi || vect_unpack } } } } */ +/* { dg-final { scan-tree-dump-times "vect_recog_widen_mult_pattern: detected" 1 "vect" { target vect_widen_mult_qi_to_hi_pattern } } } */ +/* { dg-final { scan-tree-dump-times "pattern recognized" 1 "vect" { target vect_widen_mult_qi_to_hi_pattern } } } */ +/* { dg-final { cleanup-tree-dump "vect" } } */ + diff --git a/gcc/tree-vect-patterns.c b/gcc/tree-vect-patterns.c index 7823cc3..06279bc 100644 --- a/gcc/tree-vect-patterns.c +++ b/gcc/tree-vect-patterns.c @@ -529,7 +529,8 @@ vect_handle_widen_op_by_const (gimple stmt, enum tree_code code, Try to find the following pattern: - type a_t, b_t; + type1 a_t; + type2 b_t; TYPE a_T, b_T, prod_T; S1 a_t = ; @@ -538,11 +539,12 @@ vect_handle_widen_op_by_const (gimple stmt, enum tree_code code, S4 b_T = (TYPE) b_t; S5 prod_T = a_T * b_T; - where type 'TYPE' is at least double the size of type 'type'. + where type 'TYPE' is at least double the size of type 'type1' and 'type2'. Also detect unsigned cases: - unsigned type a_t, b_t; + unsigned type1 a_t; + unsigned type2 b_t; unsigned TYPE u_prod_T; TYPE a_T, b_T, prod_T; @@ -661,6 +663,50 @@ vect_recog_widen_mult_pattern (vec<gimple> *stmts, return NULL; } + /* If the two arguments have different sizes, convert the one with + the smaller type into the larger type. 
*/ + if (TYPE_PRECISION (half_type0) != TYPE_PRECISION (half_type1)) + { + tree* oprnd = NULL; + gimple def_stmt = NULL; + + if (TYPE_PRECISION (half_type0) < TYPE_PRECISION (half_type1)) + { + def_stmt = def_stmt0; + half_type0 = half_type1; + oprnd = &oprnd0; + } + else + { + def_stmt = def_stmt1; + half_type1 = half_type0; + oprnd = &oprnd1; + } + + if (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (def_stmt))) + { + gimple new_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (def_stmt)); + /* Check if the already created pattern stmt is what we need. */ + if (!is_gimple_assign (new_stmt) + || gimple_assign_rhs_code (new_stmt) != NOP_EXPR + || TREE_TYPE (gimple_assign_lhs (new_stmt)) != half_type0) + return NULL; + + stmts->safe_push (def_stmt); + *oprnd = gimple_assign_lhs (new_stmt); + } + else + { + tree old_oprnd = gimple_assign_rhs1 (def_stmt); + tree new_oprnd = make_ssa_name (half_type0, NULL); + gimple new_stmt = gimple_build_assign_with_ops (NOP_EXPR, new_oprnd, + old_oprnd, NULL_TREE); + STMT_VINFO_RELATED_STMT (vinfo_for_stmt (def_stmt)) = new_stmt; + stmts->safe_push (def_stmt); + *oprnd = new_oprnd; + } + } + /* Handle unsigned case. Look for S6 u_prod_T = (unsigned TYPE) prod_T; Use unsigned TYPE as the type for WIDEN_MULT_EXPR. */ diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c index 72dfacd..e1ca2a2 100644 --- a/gcc/tree-vect-stmts.c +++ b/gcc/tree-vect-stmts.c @@ -2504,12 +2504,7 @@ vectorizable_conversion (gimple stmt, gimple_stmt_iterator *gsi, if (supportable_widening_operation (code, stmt, vectype_out, vectype_in, &code1, &code2, &multi_step_cvt, &interm_types)) - { - /* Binary widening operation can only be supported directly by the - architecture. 
*/ - gcc_assert (!(multi_step_cvt && op_type == binary_op)); - break; - } + break; if (code != FLOAT_EXPR || (GET_MODE_SIZE (TYPE_MODE (lhs_type)) @@ -2787,6 +2782,15 @@ vectorizable_conversion (gimple stmt, gimple_stmt_iterator *gsi, c1 = codecvt1; c2 = codecvt2; } + else if (op_type == binary_op && i < multi_step_cvt) + { + /* For binary widening operations, if multi-steps are needed, + use VEC_UNPACK_LO_EXPR/VEC_UNPACK_HI_EXPR to convert + intermediate result to final one. */ + c1 = VEC_UNPACK_LO_EXPR; + c2 = VEC_UNPACK_HI_EXPR; + op_type = unary_op; + } vect_create_vectorized_promotion_stmts (&vec_oprnds0, &vec_oprnds1, stmt, this_dest, gsi, @@ -6510,7 +6514,7 @@ supportable_widening_operation (enum tree_code code, gimple stmt, enum tree_code c1, c2; int i; tree prev_type, intermediate_type; - enum machine_mode intermediate_mode, prev_mode; + enum machine_mode intermediate_mode; optab optab3, optab4; *multi_step_cvt = 0; @@ -6634,11 +6638,17 @@ supportable_widening_operation (enum tree_code code, gimple stmt, types. */ prev_type = vectype; - prev_mode = vec_mode; - if (!CONVERT_EXPR_CODE_P (code)) + /* For WIDEN_MULT_EXPR, it is possible that two steps are needed. For + example, consider WIDEN_MULT_EXPR from char to int. In the first step, + we can get the widen-mult result as short, and then widen it again to int. + VEC_UNPACK_LO_EXPR/VEC_UNPACK_HI_EXPR are used in the second step. */ + if (!CONVERT_EXPR_CODE_P (code) && code != WIDEN_MULT_EXPR) return false; + c1 = VEC_UNPACK_LO_EXPR; + c2 = VEC_UNPACK_HI_EXPR; + /* We assume here that there will not be more than MAX_INTERM_CVT_STEPS intermediate steps in promotion sequence. 
We try MAX_INTERM_CVT_STEPS to get to NARROW_VECTYPE, and fail if we do @@ -6654,10 +6664,6 @@ supportable_widening_operation (enum tree_code code, gimple stmt, optab4 = optab_for_tree_code (c2, intermediate_type, optab_default); if (!optab3 || !optab4 - || (icode1 = optab_handler (optab1, prev_mode)) == CODE_FOR_nothing - || insn_data[icode1].operand[0].mode != intermediate_mode - || (icode2 = optab_handler (optab2, prev_mode)) == CODE_FOR_nothing - || insn_data[icode2].operand[0].mode != intermediate_mode || ((icode1 = optab_handler (optab3, intermediate_mode)) == CODE_FOR_nothing) || ((icode2 = optab_handler (optab4, intermediate_mode)) @@ -6672,7 +6678,6 @@ supportable_widening_operation (enum tree_code code, gimple stmt, return true; prev_type = intermediate_type; - prev_mode = intermediate_mode; } interm_types->release ();
diff --git a/gcc/ChangeLog b/gcc/ChangeLog index f298c0b..44ed204 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,12 @@ +2013-12-02 Cong Hou <co...@google.com> + + * tree-vect-patterns.c (vect_recog_widen_mult_pattern): Enhance + the widen-mult pattern by handling two operands with different + sizes. + * tree-vect-stmts.c (vectorizable_conversion): Allow multi-steps + conversions after widening mult operation. + (supportable_widening_operation): Likewise. + 2013-11-22 Jakub Jelinek <ja...@redhat.com> PR sanitizer/59061 diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index 12d2c90..611ae1c 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,8 @@ +2013-12-02 Cong Hou <co...@google.com> + + * gcc.dg/vect/vect-widen-mult-u8-s16-s32.c: New test. + * gcc.dg/vect/vect-widen-mult-u8-u32.c: New test. + 2013-11-22 Jakub Jelinek <ja...@redhat.com> * c-c++-common/asan/no-redundant-instrumentation-7.c: Fix diff --git a/gcc/testsuite/gcc.dg/vect/vect-widen-mult-u8-s16-s32.c b/gcc/testsuite/gcc.dg/vect/vect-widen-mult-u8-s16-s32.c new file mode 100644 index 0000000..9f9081b --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-widen-mult-u8-s16-s32.c @@ -0,0 +1,48 @@ +/* { dg-require-effective-target vect_int } */ + +#include <stdarg.h> +#include "tree-vect.h" + +#define N 64 + +unsigned char X[N] __attribute__ ((__aligned__(__BIGGEST_ALIGNMENT__))); +short Y[N] __attribute__ ((__aligned__(__BIGGEST_ALIGNMENT__))); +int result[N]; + +/* unsigned char * short -> int widening-mult. 
*/ +__attribute__ ((noinline)) int +foo1(int len) { + int i; + + for (i=0; i<len; i++) { + result[i] = X[i] * Y[i]; + } +} + +int main (void) +{ + int i; + + check_vect (); + + for (i=0; i<N; i++) { + X[i] = i; + Y[i] = 64-i; + __asm__ volatile (""); + } + + foo1 (N); + + for (i=0; i<N; i++) { + if (result[i] != X[i] * Y[i]) + abort (); + } + + return 0; +} + +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target { vect_widen_mult_hi_to_si || vect_unpack } } } } */ +/* { dg-final { scan-tree-dump-times "vect_recog_widen_mult_pattern: detected" 1 "vect" { target vect_widen_mult_hi_to_si_pattern } } } */ +/* { dg-final { scan-tree-dump-times "pattern recognized" 1 "vect" { target vect_widen_mult_hi_to_si_pattern } } } */ +/* { dg-final { cleanup-tree-dump "vect" } } */ + diff --git a/gcc/testsuite/gcc.dg/vect/vect-widen-mult-u8-u32.c b/gcc/testsuite/gcc.dg/vect/vect-widen-mult-u8-u32.c new file mode 100644 index 0000000..12c4692 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-widen-mult-u8-u32.c @@ -0,0 +1,48 @@ +/* { dg-require-effective-target vect_int } */ + +#include <stdarg.h> +#include "tree-vect.h" + +#define N 64 + +unsigned char X[N] __attribute__ ((__aligned__(__BIGGEST_ALIGNMENT__))); +unsigned char Y[N] __attribute__ ((__aligned__(__BIGGEST_ALIGNMENT__))); +unsigned int result[N]; + +/* unsigned char-> unsigned int widening-mult. 
*/ +__attribute__ ((noinline)) int +foo1(int len) { + int i; + + for (i=0; i<len; i++) { + result[i] = X[i] * Y[i]; + } +} + +int main (void) +{ + int i; + + check_vect (); + + for (i=0; i<N; i++) { + X[i] = i; + Y[i] = 64-i; + __asm__ volatile (""); + } + + foo1 (N); + + for (i=0; i<N; i++) { + if (result[i] != X[i] * Y[i]) + abort (); + } + + return 0; +} + +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target { vect_widen_mult_qi_to_hi || vect_unpack } } } } */ +/* { dg-final { scan-tree-dump-times "vect_recog_widen_mult_pattern: detected" 1 "vect" { target vect_widen_mult_qi_to_hi_pattern } } } */ +/* { dg-final { scan-tree-dump-times "pattern recognized" 1 "vect" { target vect_widen_mult_qi_to_hi_pattern } } } */ +/* { dg-final { cleanup-tree-dump "vect" } } */ + diff --git a/gcc/tree-vect-patterns.c b/gcc/tree-vect-patterns.c index 7823cc3..06279bc 100644 --- a/gcc/tree-vect-patterns.c +++ b/gcc/tree-vect-patterns.c @@ -529,7 +529,8 @@ vect_handle_widen_op_by_const (gimple stmt, enum tree_code code, Try to find the following pattern: - type a_t, b_t; + type1 a_t; + type2 b_t; TYPE a_T, b_T, prod_T; S1 a_t = ; @@ -538,11 +539,12 @@ vect_handle_widen_op_by_const (gimple stmt, enum tree_code code, S4 b_T = (TYPE) b_t; S5 prod_T = a_T * b_T; - where type 'TYPE' is at least double the size of type 'type'. + where type 'TYPE' is at least double the size of type 'type1' and 'type2'. Also detect unsigned cases: - unsigned type a_t, b_t; + unsigned type1 a_t; + unsigned type2 b_t; unsigned TYPE u_prod_T; TYPE a_T, b_T, prod_T; @@ -661,6 +663,50 @@ vect_recog_widen_mult_pattern (vec<gimple> *stmts, return NULL; } + /* If the two arguments have different sizes, convert the one with + the smaller type into the larger type. 
*/ + if (TYPE_PRECISION (half_type0) != TYPE_PRECISION (half_type1)) + { + tree* oprnd = NULL; + gimple def_stmt = NULL; + + if (TYPE_PRECISION (half_type0) < TYPE_PRECISION (half_type1)) + { + def_stmt = def_stmt0; + half_type0 = half_type1; + oprnd = &oprnd0; + } + else + { + def_stmt = def_stmt1; + half_type1 = half_type0; + oprnd = &oprnd1; + } + + if (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (def_stmt))) + { + gimple new_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (def_stmt)); + /* Check if the already created pattern stmt is what we need. */ + if (!is_gimple_assign (new_stmt) + || gimple_assign_rhs_code (new_stmt) != NOP_EXPR + || TREE_TYPE (gimple_assign_lhs (new_stmt)) != half_type0) + return NULL; + + stmts->safe_push (def_stmt); + *oprnd = gimple_assign_lhs (new_stmt); + } + else + { + tree old_oprnd = gimple_assign_rhs1 (def_stmt); + tree new_oprnd = make_ssa_name (half_type0, NULL); + gimple new_stmt = gimple_build_assign_with_ops (NOP_EXPR, new_oprnd, + old_oprnd, NULL_TREE); + STMT_VINFO_RELATED_STMT (vinfo_for_stmt (def_stmt)) = new_stmt; + stmts->safe_push (def_stmt); + *oprnd = new_oprnd; + } + } + /* Handle unsigned case. Look for S6 u_prod_T = (unsigned TYPE) prod_T; Use unsigned TYPE as the type for WIDEN_MULT_EXPR. */ diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c index 72dfacd..e1ca2a2 100644 --- a/gcc/tree-vect-stmts.c +++ b/gcc/tree-vect-stmts.c @@ -2504,12 +2504,7 @@ vectorizable_conversion (gimple stmt, gimple_stmt_iterator *gsi, if (supportable_widening_operation (code, stmt, vectype_out, vectype_in, &code1, &code2, &multi_step_cvt, &interm_types)) - { - /* Binary widening operation can only be supported directly by the - architecture. 
*/ - gcc_assert (!(multi_step_cvt && op_type == binary_op)); - break; - } + break; if (code != FLOAT_EXPR || (GET_MODE_SIZE (TYPE_MODE (lhs_type)) @@ -2787,6 +2782,15 @@ vectorizable_conversion (gimple stmt, gimple_stmt_iterator *gsi, c1 = codecvt1; c2 = codecvt2; } + else if (op_type == binary_op && i < multi_step_cvt) + { + /* For binary widening operations, if multi-steps are needed, + use VEC_UNPACK_LO_EXPR/VEC_UNPACK_HI_EXPR to convert + intermediate result to final one. */ + c1 = VEC_UNPACK_LO_EXPR; + c2 = VEC_UNPACK_HI_EXPR; + op_type = unary_op; + } vect_create_vectorized_promotion_stmts (&vec_oprnds0, &vec_oprnds1, stmt, this_dest, gsi, @@ -6510,7 +6514,7 @@ supportable_widening_operation (enum tree_code code, gimple stmt, enum tree_code c1, c2; int i; tree prev_type, intermediate_type; - enum machine_mode intermediate_mode, prev_mode; + enum machine_mode intermediate_mode; optab optab3, optab4; *multi_step_cvt = 0; @@ -6634,11 +6638,17 @@ supportable_widening_operation (enum tree_code code, gimple stmt, types. */ prev_type = vectype; - prev_mode = vec_mode; - if (!CONVERT_EXPR_CODE_P (code)) + /* For WIDEN_MULT_EXPR, it is possible that two steps are needed. For + example, consider WIDEN_MULT_EXPR from char to int. In the first step, + we can get the widen-mult result as short, and then widen it again to int. + VEC_UNPACK_LO_EXPR/VEC_UNPACK_HI_EXPR are used in the second step. */ + if (!CONVERT_EXPR_CODE_P (code) && code != WIDEN_MULT_EXPR) return false; + c1 = VEC_UNPACK_LO_EXPR; + c2 = VEC_UNPACK_HI_EXPR; + /* We assume here that there will not be more than MAX_INTERM_CVT_STEPS intermediate steps in promotion sequence. 
We try MAX_INTERM_CVT_STEPS to get to NARROW_VECTYPE, and fail if we do @@ -6654,10 +6664,6 @@ supportable_widening_operation (enum tree_code code, gimple stmt, optab4 = optab_for_tree_code (c2, intermediate_type, optab_default); if (!optab3 || !optab4 - || (icode1 = optab_handler (optab1, prev_mode)) == CODE_FOR_nothing - || insn_data[icode1].operand[0].mode != intermediate_mode - || (icode2 = optab_handler (optab2, prev_mode)) == CODE_FOR_nothing - || insn_data[icode2].operand[0].mode != intermediate_mode || ((icode1 = optab_handler (optab3, intermediate_mode)) == CODE_FOR_nothing) || ((icode2 = optab_handler (optab4, intermediate_mode)) @@ -6672,7 +6678,6 @@ supportable_widening_operation (enum tree_code code, gimple stmt, return true; prev_type = intermediate_type; - prev_mode = intermediate_mode; } interm_types->release ();