Hello,

dropping the builtin as early as possible seems like it can only help us optimize the code. Jakub suggested in the PR that he liked this approach better than using __builtin_shuffle in the header. There is already some coverage in the testsuite (as I noticed when I tried to restrict the argument to [0, 3]...).
If this one is ok, I may add a few more (say shufps to begin with) later.

Bootstrap+regtest on x86_64-pc-linux-gnu.

2019-05-20  Marc Glisse  <marc.gli...@inria.fr>

	PR rtl-optimization/43147
	* config/i386/i386.c (ix86_gimple_fold_builtin): Handle
	IX86_BUILTIN_SHUFPD.

-- 
Marc Glisse
Index: gcc/config/i386/i386.c =================================================================== --- gcc/config/i386/i386.c (revision 271376) +++ gcc/config/i386/i386.c (working copy) @@ -17290,21 +17290,21 @@ ix86_fold_builtin (tree fndecl, int n_ar bool ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi) { gimple *stmt = gsi_stmt (*gsi); tree fndecl = gimple_call_fndecl (stmt); gcc_checking_assert (fndecl && fndecl_built_in_p (fndecl, BUILT_IN_MD)); int n_args = gimple_call_num_args (stmt); enum ix86_builtins fn_code = (enum ix86_builtins) DECL_FUNCTION_CODE (fndecl); tree decl = NULL_TREE; - tree arg0, arg1; + tree arg0, arg1, arg2; enum rtx_code rcode; unsigned HOST_WIDE_INT count; bool is_vshift; switch (fn_code) { case IX86_BUILTIN_TZCNT32: decl = builtin_decl_implicit (BUILT_IN_CTZ); goto fold_tzcnt_lzcnt; @@ -17594,20 +17594,46 @@ ix86_gimple_fold_builtin (gimple_stmt_it arithmetic right shift the result is zero. */ location_t loc = gimple_location (stmt); gimple *g = gimple_build_assign (gimple_call_lhs (stmt), build_zero_cst (TREE_TYPE (arg0))); gimple_set_location (g, loc); gsi_replace (gsi, g, false); return true; } break; + case IX86_BUILTIN_SHUFPD: + arg2 = gimple_call_arg (stmt, 2); + if (TREE_CODE (arg2) == INTEGER_CST) + { + location_t loc = gimple_location (stmt); + unsigned HOST_WIDE_INT imask = TREE_INT_CST_LOW (arg2); + arg0 = gimple_call_arg (stmt, 0); + arg1 = gimple_call_arg (stmt, 1); + tree itype = long_long_integer_type_node; + tree vtype = build_vector_type (itype, 2); /* V2DI */ + tree_vector_builder elts (vtype, 2, 1); + /* Ignore bits other than the lowest 2. 
*/ + elts.quick_push (build_int_cst (itype, imask & 1)); + imask >>= 1; + elts.quick_push (build_int_cst (itype, 2 + (imask & 1))); + tree omask = elts.build (); + gimple *g = gimple_build_assign (gimple_call_lhs (stmt), + VEC_PERM_EXPR, + arg0, arg1, omask); + gimple_set_location (g, loc); + gsi_replace (gsi, g, false); + return true; + } + // Do not error yet, the constant could be propagated later? + break; + default: break; } return false; } /* Handler for an SVML-style interface to a library with vectorized intrinsics. */