Given the following loop:

int a[N];
short b[N*2];

for (int i = 0; i < N; ++i)
  a[i] = b[i*2];


After being vectorized, the access to b[i*2] will be compiled into
several packing statements, while the type promotion from short to int
will be compiled into several unpacking statements. With this patch,
each pair of pack/unpack statements will be replaced by less expensive
statements (with shift or bit-and operations).

On x86_64, the loop above will be compiled into the following assembly
(with -O2 -ftree-vectorize):

movdqu 0x10(%rcx),%xmm3
movdqu -0x20(%rcx),%xmm0
movdqa %xmm0,%xmm2
punpcklwd %xmm3,%xmm0
punpckhwd %xmm3,%xmm2
movdqa %xmm0,%xmm3
punpcklwd %xmm2,%xmm0
punpckhwd %xmm2,%xmm3
movdqa %xmm1,%xmm2
punpcklwd %xmm3,%xmm0
pcmpgtw %xmm0,%xmm2
movdqa %xmm0,%xmm3
punpckhwd %xmm2,%xmm0
punpcklwd %xmm2,%xmm3
movups %xmm0,-0x10(%rdx)
movups %xmm3,-0x20(%rdx)


With this patch, the generated assembly is shown below:

movdqu 0x10(%rcx),%xmm0
movdqu -0x20(%rcx),%xmm1
pslld  $0x10,%xmm0
psrad  $0x10,%xmm0
pslld  $0x10,%xmm1
movups %xmm0,-0x10(%rdx)
psrad  $0x10,%xmm1
movups %xmm1,-0x20(%rdx)


Bootstrapped and tested on x86-64. OK for trunk?


thanks,
Cong
diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 117cdd0..e7143f1 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,8 @@
+2014-04-23  Cong Hou  <co...@google.com>
+
+       * tree-vect-stmts.c (detect_pack_unpack_pattern): New function.
+       (vect_gen_widened_results_half): Call detect_pack_unpack_pattern.
+
 2014-04-23  David Malcolm  <dmalc...@redhat.com>
 
        * is-a.h: Update comments to reflect the following changes to the
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
index 62b07f4..a8755b3 100644
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,3 +1,7 @@
+2014-04-23  Cong Hou  <co...@google.com>
+
+       * gcc.dg/vect/vect-125.c: New test.
+
 2014-04-23  Jeff Law  <l...@redhat.com>
 
        PR tree-optimization/60902
diff --git a/gcc/testsuite/gcc.dg/vect/vect-125.c 
b/gcc/testsuite/gcc.dg/vect/vect-125.c
new file mode 100644
index 0000000..988dea6
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-125.c
@@ -0,0 +1,122 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <limits.h>
+#include "tree-vect.h"
+
+#define N 64
+
+char b[N];
+unsigned char c[N];
+short d[N];
+unsigned short e[N];
+
+__attribute__((noinline)) void
+test1 ()
+{
+  int a[N];
+  int i;
+  for (i = 0; i < N/2; i++)
+    {
+      a[i] = b[i*2];
+      a[i+N/2] = b[i*2+1];
+    }
+  for (i = 0; i < N/2; i++)
+    if (a[i] != b[i*2] || a[i+N/2] != b[i*2+1])
+      abort ();
+
+  for (i = 0; i < N/2; i++)
+    {
+      a[i] = c[i*2];
+      a[i+N/2] = c[i*2+1];
+    }
+  for (i = 0; i < N/2; i++)
+    if (a[i] != c[i*2] || a[i+N/2] != c[i*2+1])
+      abort ();
+
+  for (i = 0; i < N/2; i++)
+    {
+      a[i] = d[i*2];
+      a[i+N/2] = d[i*2+1];
+    }
+  for (i = 0; i < N/2; i++)
+    if (a[i] != d[i*2] || a[i+N/2] != d[i*2+1])
+      abort ();
+
+  for (i = 0; i < N/2; i++)
+    {
+      a[i] = e[i*2];
+      a[i+N/2] = e[i*2+1];
+    }
+  for (i = 0; i < N/2; i++)
+    if (a[i] != e[i*2] || a[i+N/2] != e[i*2+1])
+      abort ();
+}
+
+__attribute__((noinline)) void
+test2 ()
+{
+  unsigned int a[N];
+  int i;
+  for (i = 0; i < N/2; i++)
+    {
+      a[i] = b[i*2];
+      a[i+N/2] = b[i*2+1];
+    }
+  for (i = 0; i < N/2; i++)
+    if (a[i] != b[i*2] || a[i+N/2] != b[i*2+1])
+      abort ();
+
+  for (i = 0; i < N/2; i++)
+    {
+      a[i] = c[i*2];
+      a[i+N/2] = c[i*2+1];
+    }
+  for (i = 0; i < N/2; i++)
+    if (a[i] != c[i*2] || a[i+N/2] != c[i*2+1])
+      abort ();
+
+  for (i = 0; i < N/2; i++)
+    {
+      a[i] = d[i*2];
+      a[i+N/2] = d[i*2+1];
+    }
+  for (i = 0; i < N/2; i++)
+    if (a[i] != d[i*2] || a[i+N/2] != d[i*2+1])
+      abort ();
+
+  for (i = 0; i < N/2; i++)
+    {
+      a[i] = e[i*2];
+      a[i+N/2] = e[i*2+1];
+    }
+  for (i = 0; i < N/2; i++)
+    if (a[i] != e[i*2] || a[i+N/2] != e[i*2+1])
+      abort ();
+}
+
+int
+main ()
+{
+  b[0] = CHAR_MIN;
+  c[0] = UCHAR_MAX;
+  d[0] = SHRT_MIN;
+  e[0] = USHRT_MAX;
+
+  int i;
+  for (i = 1; i < N; i++)
+    {
+      b[i] = b[i-1] + 1;
+      c[i] = c[i-1] - 1;
+      d[i] = d[i-1] + 1;
+      e[i] = e[i-1] - 1;
+    }
+
+  test1 ();
+  test2 ();
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 4 loops" 2 "vect" } } */
+/* { dg-final { scan-tree-dump-times "A pack-unpack pattern is recognized" 32 
"vect" } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
+
diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
index 1a51d6d..d0cf1f4 100644
--- a/gcc/tree-vect-stmts.c
+++ b/gcc/tree-vect-stmts.c
@@ -3191,6 +3191,174 @@ vectorizable_simd_clone_call (gimple stmt, 
gimple_stmt_iterator *gsi,
 }
 
 
+/* Function detect_pack_unpack_pattern
+
+   Detect the following pattern:
+
+   S1  vect3 = VEC_PERM_EXPR <vect1, vect2, { 0, 2, 4, ... }>;
+   or
+   S1  vect3 = VEC_PERM_EXPR <vect1, vect2, { 1, 3, 5, ... }>;
+
+   S2  vect4 = [vec_unpack_lo_expr] vect3;
+   or/and
+   S3  vect5 = [vec_unpack_hi_expr] vect3;
+
+   S1 is usually generated from accessing even/odd elements of an array
+   and S2/S3 are generated from type promotion.  In this case S1 and S2/S3 can
+   be replaced by less expensive statements.  */
+
+static gimple
+detect_pack_unpack_pattern (enum tree_code code, tree vec_oprnd, int op_type,
+                           tree vec_dest, gimple_stmt_iterator *gsi,
+                           gimple stmt)
+{
+  if ((code != VEC_UNPACK_LO_EXPR && code != VEC_UNPACK_HI_EXPR)
+      || op_type != unary_op)
+    return NULL;
+
+  if (TREE_CODE (vec_oprnd) != SSA_NAME)
+    return NULL;
+
+  gimple def_stmt = SSA_NAME_DEF_STMT (vec_oprnd);
+  if (!is_gimple_assign (def_stmt))
+    return NULL;
+  enum tree_code def_code = gimple_assign_rhs_code (def_stmt);
+  if (def_code != VEC_PERM_EXPR)
+    return NULL;
+
+  bool unpack_lo = (code == VEC_UNPACK_LO_EXPR);
+  tree perm_vec = gimple_assign_rhs3 (def_stmt);
+  gcc_assert (TREE_CODE (perm_vec) == VECTOR_CST);
+
+  /* Check if VEC_PERM_EXPR is generated from accessing even/odd elements of
+     an array.  */
+  int nelts = VECTOR_CST_NELTS (perm_vec);
+  int odd_num = 0;
+  for (int i = 0; i < nelts / 2; i++)
+    {
+      tree elt = VECTOR_CST_ELT (perm_vec,
+                                unpack_lo ? i : i + nelts / 2);
+      int elt_val = (HOST_WIDE_INT) TREE_INT_CST_LOW (elt);
+      if (!unpack_lo)
+       elt_val -= nelts;
+
+      if (elt_val / 2 != i)
+       return NULL;
+      odd_num += elt_val & 1;
+    }
+
+  bool even_elem = false;
+  if (odd_num == 0)
+    even_elem = true;
+  else if (odd_num != nelts / 2)
+    return NULL;
+
+  if (dump_enabled_p ())
+    dump_printf_loc (MSG_NOTE, vect_location,
+                    "A pack-unpack pattern is recognized.\n");
+
+  tree vec_oprnd0 = unpack_lo ? gimple_assign_rhs1 (def_stmt)
+                             : gimple_assign_rhs2 (def_stmt);
+  tree vec_type = TREE_TYPE (vec_oprnd0);
+  tree scalar_type = lang_hooks.types.type_for_mode (
+                      GET_MODE_INNER (TYPE_MODE (vec_type)),
+                      TYPE_UNSIGNED (vec_type));
+  tree vectype_out = TREE_TYPE (vec_dest);
+
+  if (even_elem)
+    {
+      /* If even elements are packed, and if the original values are unsigned,
+        build a bit_and statement to replace the pack-unpack statements.
+        If the original values are signed, build a lshift statement and a
+        rshift statement to replace the pack-unpack statements.  */
+
+      if (TYPE_UNSIGNED (vec_type))
+       {
+         tree *mask = XALLOCAVEC (tree, nelts);
+         unsigned HOST_WIDE_INT max_val = TREE_INT_CST_LOW (
+                                            TYPE_MAX_VALUE (scalar_type));
+         for (int k = 0; k < nelts; ++k)
+           mask[k] = build_int_cst (scalar_type, (k & 1) ? 0 : max_val);
+         tree vec_oprnd1 = build_vector (TREE_TYPE (vec_oprnd0), mask);
+         tree temp = make_temp_ssa_name (vec_type, NULL, "vect_temp");
+
+         gimple new_stmt1, new_stmt2;
+         new_stmt1 = gimple_build_assign_with_ops (BIT_AND_EXPR, temp,
+                                                   vec_oprnd0, vec_oprnd1);
+         new_stmt2 = gimple_build_assign_with_ops (NOP_EXPR, vec_dest,
+                                                   temp, NULL_TREE);
+         gimple_assign_set_lhs (new_stmt2,
+                                make_ssa_name (vec_dest, new_stmt2));
+
+         vect_finish_stmt_generation (stmt, new_stmt1, gsi);
+         vect_finish_stmt_generation (stmt, new_stmt2, gsi);
+
+         return new_stmt2;
+       }
+      else
+       {
+         tree scalar_itype = lang_hooks.types.type_for_mode (
+                               GET_MODE_INNER (TYPE_MODE (vectype_out)), 0);
+         tree vecitype = get_vectype_for_scalar_type (scalar_itype);
+         tree shift_oprnd = build_int_cst (scalar_type,
+                                           TYPE_PRECISION (scalar_type));
+
+         tree temp1 = make_temp_ssa_name (vecitype, NULL, "vect_temp");
+         tree temp2 = make_temp_ssa_name (vecitype, NULL, "vect_temp");
+         tree temp3 = make_temp_ssa_name (vecitype, NULL, "vect_temp");
+
+         gimple new_stmt1, new_stmt2, new_stmt3, new_stmt4;
+         new_stmt1 = gimple_build_assign_with_ops (NOP_EXPR, temp1,
+                                                   vec_oprnd0, NULL);
+         new_stmt2 = gimple_build_assign_with_ops (LSHIFT_EXPR, temp2,
+                                                   temp1, shift_oprnd);
+         new_stmt3 = gimple_build_assign_with_ops (RSHIFT_EXPR, temp3,
+                                                   temp2, shift_oprnd);
+         new_stmt4 = gimple_build_assign_with_ops (NOP_EXPR, vec_dest,
+                                                   temp3, NULL);
+         gimple_assign_set_lhs (new_stmt4,
+                                make_ssa_name (vec_dest, new_stmt4));
+
+         vect_finish_stmt_generation (stmt, new_stmt1, gsi);
+         vect_finish_stmt_generation (stmt, new_stmt2, gsi);
+         vect_finish_stmt_generation (stmt, new_stmt3, gsi);
+         vect_finish_stmt_generation (stmt, new_stmt4, gsi);
+
+         return new_stmt4;
+       }
+    }
+  else
+    {
+      /* If odd elements are packed, build a rshift statement to replace
+        the pack-unpack statements.  */
+
+      gimple new_stmt1, new_stmt2, new_stmt3;
+      tree scalar_itype = lang_hooks.types.type_for_mode (
+                           GET_MODE_INNER (TYPE_MODE (vectype_out)),
+                           TYPE_UNSIGNED (scalar_type));
+      tree vecitype = get_vectype_for_scalar_type (scalar_itype);
+      tree temp1 = make_temp_ssa_name (vecitype, NULL, "vect_temp");
+      tree temp2 = make_temp_ssa_name (vecitype, NULL, "vect_temp");
+      tree shift_oprnd = build_int_cst (scalar_type,
+                                       TYPE_PRECISION (scalar_type));
+
+      new_stmt1 = gimple_build_assign_with_ops (NOP_EXPR, temp1,
+                                               vec_oprnd0, NULL);
+      new_stmt2 = gimple_build_assign_with_ops (RSHIFT_EXPR, temp2,
+                                               temp1, shift_oprnd);
+      new_stmt3 = gimple_build_assign_with_ops (NOP_EXPR, vec_dest,
+                                               temp2, NULL);
+      gimple_assign_set_lhs (new_stmt3,
+                            make_ssa_name (vec_dest, new_stmt2));
+
+      vect_finish_stmt_generation (stmt, new_stmt1, gsi);
+      vect_finish_stmt_generation (stmt, new_stmt2, gsi);
+      vect_finish_stmt_generation (stmt, new_stmt3, gsi);
+
+      return new_stmt3;
+    }
+}
+
 /* Function vect_gen_widened_results_half
 
    Create a vector stmt whose code, type, number of arguments, and result
@@ -3223,10 +3391,15 @@ vect_gen_widened_results_half (enum tree_code code,
     }
   else
     {
+      if ((new_stmt = detect_pack_unpack_pattern (
+                         code, vec_oprnd0, op_type, vec_dest, gsi, stmt)))
+        return new_stmt;
+
       /* Generic support */
       gcc_assert (op_type == TREE_CODE_LENGTH (code));
       if (op_type != binary_op)
        vec_oprnd1 = NULL;
+
       new_stmt = gimple_build_assign_with_ops (code, vec_dest, vec_oprnd0,
                                               vec_oprnd1);
       new_temp = make_ssa_name (vec_dest, new_stmt);

Reply via email to