Hello,

dropping the builtin as early as possible seems like it can only help us optimize the code. Jakub suggested in the PR that he liked this approach better than using __builtin_shuffle in the header. There is already some coverage in the testsuite (as I noticed when I tried to restrict the argument to [0, 3]...).

If this one is ok, I may add a few more (say shufps to begin with) later.

Bootstrapped and regtested on x86_64-pc-linux-gnu.

2019-05-20  Marc Glisse  <marc.gli...@inria.fr>

        PR rtl-optimization/43147
        * config/i386/i386.c (ix86_gimple_fold_builtin): Handle
        IX86_BUILTIN_SHUFPD.


--
Marc Glisse
Index: gcc/config/i386/i386.c
===================================================================
--- gcc/config/i386/i386.c      (revision 271376)
+++ gcc/config/i386/i386.c      (working copy)
@@ -17290,21 +17290,21 @@ ix86_fold_builtin (tree fndecl, int n_ar
 
 bool
 ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi)
 {
   gimple *stmt = gsi_stmt (*gsi);
   tree fndecl = gimple_call_fndecl (stmt);
   gcc_checking_assert (fndecl && fndecl_built_in_p (fndecl, BUILT_IN_MD));
   int n_args = gimple_call_num_args (stmt);
   enum ix86_builtins fn_code = (enum ix86_builtins) DECL_FUNCTION_CODE (fndecl);
   tree decl = NULL_TREE;
-  tree arg0, arg1;
+  tree arg0, arg1, arg2;
   enum rtx_code rcode;
   unsigned HOST_WIDE_INT count;
   bool is_vshift;
 
   switch (fn_code)
     {
     case IX86_BUILTIN_TZCNT32:
       decl = builtin_decl_implicit (BUILT_IN_CTZ);
       goto fold_tzcnt_lzcnt;
 
@@ -17594,20 +17594,46 @@ ix86_gimple_fold_builtin (gimple_stmt_it
             arithmetic right shift the result is zero.  */
          location_t loc = gimple_location (stmt);
          gimple *g = gimple_build_assign (gimple_call_lhs (stmt),
                                           build_zero_cst (TREE_TYPE (arg0)));
          gimple_set_location (g, loc);
          gsi_replace (gsi, g, false);
          return true;
        }
       break;
 
+    case IX86_BUILTIN_SHUFPD:
+      arg2 = gimple_call_arg (stmt, 2);
+      if (TREE_CODE (arg2) == INTEGER_CST)
+       {
+         location_t loc = gimple_location (stmt);
+         unsigned HOST_WIDE_INT imask = TREE_INT_CST_LOW (arg2);
+         arg0 = gimple_call_arg (stmt, 0);
+         arg1 = gimple_call_arg (stmt, 1);
+         tree itype = long_long_integer_type_node;
+         tree vtype = build_vector_type (itype, 2); /* V2DI */
+         tree_vector_builder elts (vtype, 2, 1);
+         /* Ignore bits other than the lowest 2.  */
+         elts.quick_push (build_int_cst (itype, imask & 1));
+         imask >>= 1;
+         elts.quick_push (build_int_cst (itype, 2 + (imask & 1)));
+         tree omask = elts.build ();
+         gimple *g = gimple_build_assign (gimple_call_lhs (stmt),
+                                          VEC_PERM_EXPR,
+                                          arg0, arg1, omask);
+         gimple_set_location (g, loc);
+         gsi_replace (gsi, g, false);
+         return true;
+       }
+      // Do not error yet, the constant could be propagated later?
+      break;
+
     default:
       break;
     }
 
   return false;
 }
 
 /* Handler for an SVML-style interface to
    a library with vectorized intrinsics.  */
 

Reply via email to