Hello!

2016-10-25  Uros Bizjak  <ubiz...@gmail.com>

    Backport from mainline
    2016-10-21  Jakub Jelinek  <ja...@redhat.com>

    PR target/78057
    * config/i386/i386.c: Include fold-const-call.h and
    tree-ssanames.h.
    (ix86_fold_builtin): Fold IX86_BUILTIN_[LT]ZCNT{16,32,64}
    with INTEGER_CST argument.
    (ix86_gimple_fold_builtin): New function.
    (TARGET_GIMPLE_FOLD_BUILTIN): Define.

    Backport from mainline
    2016-10-20  Uros Bizjak  <ubiz...@gmail.com>

    PR target/78037
    * config/i386/bmiintrin.h (__tzcnt_u16): Call __builtin_ia32_tzcnt_u16.
    (__tzcnt_u32, _tzcnt_u32): Call __builtin_ia32_tzcnt_u32.
    (__tzcnt_u64, _tzcnt_u64): Call __builtin_ia32_tzcnt_u64.
    * config/i386/lzcntintrin.h (__lzcnt16): Call
    __builtin_ia32_lzcnt_u16.
    (__lzcnt32, _lzcnt_u32): Call __builtin_ia32_lzcnt_u32.
    (__lzcnt64, _lzcnt_u64): Call __builtin_ia32_lzcnt_u64.
    * config/i386/i386.md (UNSPEC_LZCNT, UNSPEC_TZCNT): New unspecs.
    (ctz<mode>2, *ctz<mode>2): Use SWI48 mode iterator.
    (bmi_tzcnt_<mode>): New expander.
    (*bmi_tzcnt_<mode>_falsedep_1): New define_insn_and_split pattern.
    (*bmi_tzcnt_<mode>_falsedep, *bmi_tzcnt_<mode>): New insn patterns.
    (clz<mode>2, clz<mode>2_lzcnt, *clz<mode>2_lzcnt): Use SWI48 mode iterator.
    (lzcnt_<mode>): New expander.
    (*lzcnt_<mode>_falsedep_1): New define_insn_and_split pattern.
    (*lzcnt_<mode>_falsedep, *lzcnt_<mode>): New insn patterns.
    * config/i386/i386-builtin-types.def (UINT_FTYPE_UINT): New.
    (UINT64_FTYPE_UINT64): New.
    * config/i386/i386-builtin.def (__builtin_clzs): Remove description.
    (__builtin_ia32_lzcnt_u16): New description.
    (__builtin_ia32_lzcnt_u32): Ditto.
    (__builtin_ia32_lzcnt_u64): Ditto.
    (__builtin_ctzs): Remove description.
    (__builtin_ia32_tzcnt_u16): New description.
    (__builtin_ia32_tzcnt_u32): Ditto.
    (__builtin_ia32_tzcnt_u64): Ditto.
    * config/i386/i386.c (ix86_expand_args_builtin): Handle
    UINT_FTYPE_UINT and UINT64_FTYPE_UINT64.

testsuite/ChangeLog:

2016-10-25  Uros Bizjak  <ubiz...@gmail.com>

    * gcc.target/i386/bmi-6.c: XFAIL.

    Backport from mainline
    2016-10-21  Jakub Jelinek  <ja...@redhat.com>

    PR target/78057
    * gcc.target/i386/pr78057.c: New test.

    Backport from mainline
    2016-10-20  Uros Bizjak  <ubiz...@gmail.com>

    PR target/78037
    * gcc.target/i386/pr78037.c: New test.

Bootstrapped and regression tested on x86_64-linux-gnu {,-m32}.

Please note that I had to XFAIL bmi-6.c due to insufficient inlining
on the gcc-6 branch.

If there are no objections, I plan to commit the patch tomorrow.

Uros.
Index: config/i386/bmiintrin.h
===================================================================
--- config/i386/bmiintrin.h     (revision 241451)
+++ config/i386/bmiintrin.h     (working copy)
@@ -37,7 +37,7 @@
 extern __inline unsigned short __attribute__((__gnu_inline__, 
__always_inline__, __artificial__))
 __tzcnt_u16 (unsigned short __X)
 {
-  return __builtin_ctzs (__X);
+  return __builtin_ia32_tzcnt_u16 (__X);
 }
 
 extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))
@@ -97,13 +97,13 @@ _blsr_u32 (unsigned int __X)
 extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))
 __tzcnt_u32 (unsigned int __X)
 {
-  return __builtin_ctz (__X);
+  return __builtin_ia32_tzcnt_u32 (__X);
 }
 
 extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))
 _tzcnt_u32 (unsigned int __X)
 {
-  return __builtin_ctz (__X);
+  return __builtin_ia32_tzcnt_u32 (__X);
 }
 
 
@@ -165,13 +165,13 @@ _blsr_u64 (unsigned long long __X)
 extern __inline unsigned long long __attribute__((__gnu_inline__, 
__always_inline__, __artificial__))
 __tzcnt_u64 (unsigned long long __X)
 {
-  return __builtin_ctzll (__X);
+  return __builtin_ia32_tzcnt_u64 (__X);
 }
 
 extern __inline unsigned long long __attribute__((__gnu_inline__, 
__always_inline__, __artificial__))
 _tzcnt_u64 (unsigned long long __X)
 {
-  return __builtin_ctzll (__X);
+  return __builtin_ia32_tzcnt_u64 (__X);
 }
 
 #endif /* __x86_64__  */
Index: config/i386/i386-builtin-types.def
===================================================================
--- config/i386/i386-builtin-types.def  (revision 241451)
+++ config/i386/i386-builtin-types.def  (working copy)
@@ -201,9 +201,11 @@ DEF_FUNCTION_TYPE (INT, PCCHAR)
 DEF_FUNCTION_TYPE (INT64, INT64)
 DEF_FUNCTION_TYPE (INT64, V2DF)
 DEF_FUNCTION_TYPE (INT64, V4SF)
+DEF_FUNCTION_TYPE (UINT, UINT)
+DEF_FUNCTION_TYPE (UINT16, UINT16)
 DEF_FUNCTION_TYPE (UINT64, INT)
-DEF_FUNCTION_TYPE (UINT16, UINT16)
 DEF_FUNCTION_TYPE (UINT64, PUNSIGNED)
+DEF_FUNCTION_TYPE (UINT64, UINT64)
 DEF_FUNCTION_TYPE (V16QI, PCCHAR)
 DEF_FUNCTION_TYPE (V16QI, V16QI)
 DEF_FUNCTION_TYPE (V2DF, PCDOUBLE)
Index: config/i386/i386.c
===================================================================
--- config/i386/i386.c  (revision 241451)
+++ config/i386/i386.c  (working copy)
@@ -76,6 +76,8 @@ along with GCC; see the file COPYING3.  If not see
 #include "case-cfn-macros.h"
 #include "regrename.h"
 #include "dojump.h"
+#include "fold-const-call.h"
+#include "tree-ssanames.h"
 
 /* This file should be included last.  */
 #include "target-def.h"
@@ -32356,7 +32358,10 @@ enum ix86_builtins
   IX86_BUILTIN_LWPINS32,
   IX86_BUILTIN_LWPINS64,
 
-  IX86_BUILTIN_CLZS,
+  /* LZCNT */
+  IX86_BUILTIN_LZCNT16,
+  IX86_BUILTIN_LZCNT32,
+  IX86_BUILTIN_LZCNT64,
 
   /* RTM */
   IX86_BUILTIN_XBEGIN,
@@ -32380,7 +32385,9 @@ enum ix86_builtins
   /* BMI instructions.  */
   IX86_BUILTIN_BEXTR32,
   IX86_BUILTIN_BEXTR64,
-  IX86_BUILTIN_CTZS,
+  IX86_BUILTIN_TZCNT16,
+  IX86_BUILTIN_TZCNT32,
+  IX86_BUILTIN_TZCNT64,
 
   /* TBM instructions.  */
   IX86_BUILTIN_BEXTRI32,
@@ -33768,13 +33775,19 @@ static const struct builtin_description bdesc_args
   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv8si, "__builtin_ia32_psrlv8si", 
IX86_BUILTIN_PSRLVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv4si, "__builtin_ia32_psrlv4si", 
IX86_BUILTIN_PSRLVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
 
-  { OPTION_MASK_ISA_LZCNT, CODE_FOR_clzhi2_lzcnt,   "__builtin_clzs",   
IX86_BUILTIN_CLZS,    UNKNOWN,     (int) UINT16_FTYPE_UINT16 },
+  /* LZCNT */
+  { OPTION_MASK_ISA_LZCNT, CODE_FOR_lzcnt_hi, "__builtin_ia32_lzcnt_u16", 
IX86_BUILTIN_LZCNT16, UNKNOWN, (int) UINT16_FTYPE_UINT16 },
+  { OPTION_MASK_ISA_LZCNT, CODE_FOR_lzcnt_si, "__builtin_ia32_lzcnt_u32", 
IX86_BUILTIN_LZCNT32, UNKNOWN, (int) UINT_FTYPE_UINT },
+  { OPTION_MASK_ISA_LZCNT | OPTION_MASK_ISA_64BIT, CODE_FOR_lzcnt_di, 
"__builtin_ia32_lzcnt_u64", IX86_BUILTIN_LZCNT64, UNKNOWN, (int) 
UINT64_FTYPE_UINT64 },
 
   /* BMI */
   { OPTION_MASK_ISA_BMI, CODE_FOR_bmi_bextr_si, "__builtin_ia32_bextr_u32", 
IX86_BUILTIN_BEXTR32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
   { OPTION_MASK_ISA_BMI | OPTION_MASK_ISA_64BIT, CODE_FOR_bmi_bextr_di, 
"__builtin_ia32_bextr_u64", IX86_BUILTIN_BEXTR64, UNKNOWN, (int) 
UINT64_FTYPE_UINT64_UINT64 },
-  { OPTION_MASK_ISA_BMI, CODE_FOR_ctzhi2,       "__builtin_ctzs",           
IX86_BUILTIN_CTZS,    UNKNOWN, (int) UINT16_FTYPE_UINT16 },
 
+  { OPTION_MASK_ISA_BMI, CODE_FOR_bmi_tzcnt_hi, "__builtin_ia32_tzcnt_u16", 
IX86_BUILTIN_TZCNT16, UNKNOWN, (int) UINT16_FTYPE_UINT16 },
+  { OPTION_MASK_ISA_BMI, CODE_FOR_bmi_tzcnt_si, "__builtin_ia32_tzcnt_u32", 
IX86_BUILTIN_TZCNT32, UNKNOWN, (int) UINT_FTYPE_UINT },
+  { OPTION_MASK_ISA_BMI | OPTION_MASK_ISA_64BIT, CODE_FOR_bmi_tzcnt_di, 
"__builtin_ia32_tzcnt_u64", IX86_BUILTIN_TZCNT64, UNKNOWN, (int) 
UINT64_FTYPE_UINT64 },
+
   /* TBM */
   { OPTION_MASK_ISA_TBM, CODE_FOR_tbm_bextri_si, "__builtin_ia32_bextri_u32", 
IX86_BUILTIN_BEXTRI32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
   { OPTION_MASK_ISA_TBM | OPTION_MASK_ISA_64BIT, CODE_FOR_tbm_bextri_di, 
"__builtin_ia32_bextri_u64", IX86_BUILTIN_BEXTRI64, UNKNOWN, (int) 
UINT64_FTYPE_UINT64_UINT64 },
@@ -37546,11 +37559,49 @@ ix86_fold_builtin (tree fndecl, int n_args,
     {
       enum ix86_builtins fn_code = (enum ix86_builtins)
                                   DECL_FUNCTION_CODE (fndecl);
-      if (fn_code ==  IX86_BUILTIN_CPU_IS
-         || fn_code == IX86_BUILTIN_CPU_SUPPORTS)
+      switch (fn_code)
        {
+       case IX86_BUILTIN_CPU_IS:
+       case IX86_BUILTIN_CPU_SUPPORTS:
          gcc_assert (n_args == 1);
           return fold_builtin_cpu (fndecl, args);
+
+       case IX86_BUILTIN_TZCNT16:
+       case IX86_BUILTIN_TZCNT32:
+       case IX86_BUILTIN_TZCNT64:
+         gcc_assert (n_args == 1);
+         if (TREE_CODE (args[0]) == INTEGER_CST)
+           {
+             tree type = TREE_TYPE (TREE_TYPE (fndecl));
+             tree arg = args[0];
+             if (fn_code == IX86_BUILTIN_TZCNT16)
+               arg = fold_convert (short_unsigned_type_node, arg);
+             if (integer_zerop (arg))
+               return build_int_cst (type, TYPE_PRECISION (TREE_TYPE (arg)));
+             else
+               return fold_const_call (CFN_CTZ, type, arg);
+           }
+         break;
+
+       case IX86_BUILTIN_LZCNT16:
+       case IX86_BUILTIN_LZCNT32:
+       case IX86_BUILTIN_LZCNT64:
+         gcc_assert (n_args == 1);
+         if (TREE_CODE (args[0]) == INTEGER_CST)
+           {
+             tree type = TREE_TYPE (TREE_TYPE (fndecl));
+             tree arg = args[0];
+             if (fn_code == IX86_BUILTIN_LZCNT16)
+               arg = fold_convert (short_unsigned_type_node, arg);
+             if (integer_zerop (arg))
+               return build_int_cst (type, TYPE_PRECISION (TREE_TYPE (arg)));
+             else
+               return fold_const_call (CFN_CLZ, type, arg);
+           }
+         break;
+
+       default:
+         break;
        }
     }
 
@@ -37561,6 +37612,70 @@ ix86_fold_builtin (tree fndecl, int n_args,
   return NULL_TREE;
 }
 
+/* Fold a MD builtin (use ix86_fold_builtin for folding into
+   constant) in GIMPLE.  */
+
+bool
+ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi)
+{
+  gimple *stmt = gsi_stmt (*gsi);
+  tree fndecl = gimple_call_fndecl (stmt);
+  gcc_checking_assert (fndecl && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD);
+  int n_args = gimple_call_num_args (stmt);
+  enum ix86_builtins fn_code = (enum ix86_builtins) DECL_FUNCTION_CODE 
(fndecl);
+  tree decl = NULL_TREE;
+  tree arg0;
+
+  switch (fn_code)
+    {
+    case IX86_BUILTIN_TZCNT32:
+      decl = builtin_decl_implicit (BUILT_IN_CTZ);
+      goto fold_tzcnt_lzcnt;
+
+    case IX86_BUILTIN_TZCNT64:
+      decl = builtin_decl_implicit (BUILT_IN_CTZLL);
+      goto fold_tzcnt_lzcnt;
+
+    case IX86_BUILTIN_LZCNT32:
+      decl = builtin_decl_implicit (BUILT_IN_CLZ);
+      goto fold_tzcnt_lzcnt;
+
+    case IX86_BUILTIN_LZCNT64:
+      decl = builtin_decl_implicit (BUILT_IN_CLZLL);
+      goto fold_tzcnt_lzcnt;
+
+    fold_tzcnt_lzcnt:
+      gcc_assert (n_args == 1);
+      arg0 = gimple_call_arg (stmt, 0);
+      if (TREE_CODE (arg0) == SSA_NAME && decl && gimple_call_lhs (stmt))
+       {
+         int prec = TYPE_PRECISION (TREE_TYPE (arg0));
+         /* If arg0 is provably non-zero, optimize into generic
+            __builtin_c[tl]z{,ll} function the middle-end handles
+            better.  */
+         if (!expr_not_equal_to (arg0, wi::zero (prec)))
+           return false;
+
+         location_t loc = gimple_location (stmt);
+         gimple *g = gimple_build_call (decl, 1, arg0);
+         gimple_set_location (g, loc);
+         tree lhs = make_ssa_name (integer_type_node);
+         gimple_call_set_lhs (g, lhs);
+         gsi_insert_before (gsi, g, GSI_SAME_STMT);
+         g = gimple_build_assign (gimple_call_lhs (stmt), NOP_EXPR, lhs);
+         gimple_set_location (g, loc);
+         gsi_replace (gsi, g, true);
+         return true;
+       }
+      break;
+
+    default:
+      break;
+    }
+
+  return false;
+}
+
 /* Make builtins to detect cpu type and features supported.  NAME is
    the builtin name, CODE is the builtin code, and FTYPE is the function
    type of the builtin.  */
@@ -38522,8 +38637,10 @@ ix86_expand_args_builtin (const struct builtin_des
     case FLOAT128_FTYPE_FLOAT128:
     case FLOAT_FTYPE_FLOAT:
     case INT_FTYPE_INT:
+    case UINT_FTYPE_UINT:
+    case UINT16_FTYPE_UINT16:
     case UINT64_FTYPE_INT:
-    case UINT16_FTYPE_UINT16:
+    case UINT64_FTYPE_UINT64:
     case INT64_FTYPE_INT64:
     case INT64_FTYPE_V4SF:
     case INT64_FTYPE_V2DF:
@@ -54588,6 +54705,9 @@ ix86_addr_space_zero_address_valid (addr_space_t a
 #undef TARGET_FOLD_BUILTIN
 #define TARGET_FOLD_BUILTIN ix86_fold_builtin
 
+#undef TARGET_GIMPLE_FOLD_BUILTIN
+#define TARGET_GIMPLE_FOLD_BUILTIN ix86_gimple_fold_builtin
+
 #undef TARGET_COMPARE_VERSION_PRIORITY
 #define TARGET_COMPARE_VERSION_PRIORITY ix86_compare_version_priority
 
Index: config/i386/i386.md
===================================================================
--- config/i386/i386.md (revision 241451)
+++ config/i386/i386.md (working copy)
@@ -174,7 +174,11 @@
   ;; For CRC32 support
   UNSPEC_CRC32
 
+  ;; For LZCNT support
+  UNSPEC_LZCNT
+
   ;; For BMI support
+  UNSPEC_TZCNT
   UNSPEC_BEXTR
 
   ;; For BMI2 support
@@ -12810,9 +12814,9 @@
 
 (define_expand "ctz<mode>2"
   [(parallel
-    [(set (match_operand:SWI248 0 "register_operand")
-         (ctz:SWI248
-           (match_operand:SWI248 1 "nonimmediate_operand")))
+    [(set (match_operand:SWI48 0 "register_operand")
+         (ctz:SWI48
+           (match_operand:SWI48 1 "nonimmediate_operand")))
      (clobber (reg:CC FLAGS_REG))])])
 
 ; False dependency happens when destination is only updated by tzcnt,
@@ -12860,8 +12864,8 @@
    (set_attr "mode" "<MODE>")])
 
 (define_insn "*ctz<mode>2"
-  [(set (match_operand:SWI248 0 "register_operand" "=r")
-       (ctz:SWI248 (match_operand:SWI248 1 "nonimmediate_operand" "rm")))
+  [(set (match_operand:SWI48 0 "register_operand" "=r")
+       (ctz:SWI48 (match_operand:SWI48 1 "nonimmediate_operand" "rm")))
    (clobber (reg:CC FLAGS_REG))]
   ""
 {
@@ -12886,15 +12890,78 @@
        (const_string "0")))
    (set_attr "mode" "<MODE>")])
 
+;; Version of tzcnt that is expanded from intrinsics.  This version provides
+;; operand size as output when source operand is zero. 
+
+(define_expand "bmi_tzcnt_<mode>"
+  [(parallel
+    [(set (match_operand:SWI248 0 "register_operand")
+         (unspec:SWI248
+           [(match_operand:SWI248 1 "nonimmediate_operand")]
+           UNSPEC_TZCNT))
+     (clobber (reg:CC FLAGS_REG))])]
+  "TARGET_BMI")
+
+; False dependency happens when destination is only updated by tzcnt,
+; lzcnt or popcnt.  There is no false dependency when destination is
+; also used in source.
+(define_insn_and_split "*bmi_tzcnt_<mode>_falsedep_1"
+  [(set (match_operand:SWI48 0 "register_operand" "=r")
+       (unspec:SWI48
+         [(match_operand:SWI48 1 "nonimmediate_operand" "rm")]
+         UNSPEC_TZCNT))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_BMI
+   && TARGET_AVOID_FALSE_DEP_FOR_BMI && optimize_function_for_speed_p (cfun)"
+  "#"
+  "&& reload_completed"
+  [(parallel
+    [(set (match_dup 0)
+         (unspec:SWI48 [(match_dup 1)] UNSPEC_TZCNT))
+     (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)
+     (clobber (reg:CC FLAGS_REG))])]
+{
+  if (!reg_mentioned_p (operands[0], operands[1]))
+    ix86_expand_clear (operands[0]);
+})
+
+(define_insn "*bmi_tzcnt_<mode>_falsedep"
+  [(set (match_operand:SWI48 0 "register_operand" "=r")
+       (unspec:SWI48
+         [(match_operand:SWI48 1 "nonimmediate_operand" "rm")]
+         UNSPEC_TZCNT))
+   (unspec [(match_operand:SWI48 2 "register_operand" "0")]
+          UNSPEC_INSN_FALSE_DEP)
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_BMI"
+  "tzcnt{<imodesuffix>}\t{%1, %0|%0, %1}"
+  [(set_attr "type" "alu1")
+   (set_attr "prefix_0f" "1")
+   (set_attr "prefix_rep" "1")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "*bmi_tzcnt_<mode>"
+  [(set (match_operand:SWI248 0 "register_operand" "=r")
+       (unspec:SWI248
+         [(match_operand:SWI248 1 "nonimmediate_operand" "rm")]
+         UNSPEC_TZCNT))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_BMI"
+  "tzcnt{<imodesuffix>}\t{%1, %0|%0, %1}"
+  [(set_attr "type" "alu1")
+   (set_attr "prefix_0f" "1")
+   (set_attr "prefix_rep" "1")
+   (set_attr "mode" "<MODE>")])
+
 (define_expand "clz<mode>2"
   [(parallel
-     [(set (match_operand:SWI248 0 "register_operand")
-          (minus:SWI248
+     [(set (match_operand:SWI48 0 "register_operand")
+          (minus:SWI48
             (match_dup 2)
-            (clz:SWI248 (match_operand:SWI248 1 "nonimmediate_operand"))))
+            (clz:SWI48 (match_operand:SWI48 1 "nonimmediate_operand"))))
       (clobber (reg:CC FLAGS_REG))])
    (parallel
-     [(set (match_dup 0) (xor:SWI248 (match_dup 0) (match_dup 2)))
+     [(set (match_dup 0) (xor:SWI48 (match_dup 0) (match_dup 2)))
       (clobber (reg:CC FLAGS_REG))])]
   ""
 {
@@ -12908,9 +12975,9 @@
 
 (define_expand "clz<mode>2_lzcnt"
   [(parallel
-    [(set (match_operand:SWI248 0 "register_operand")
-         (clz:SWI248
-           (match_operand:SWI248 1 "nonimmediate_operand")))
+    [(set (match_operand:SWI48 0 "register_operand")
+         (clz:SWI48
+           (match_operand:SWI48 1 "nonimmediate_operand")))
      (clobber (reg:CC FLAGS_REG))])]
   "TARGET_LZCNT")
 
@@ -12947,8 +13014,8 @@
    (set_attr "mode" "<MODE>")])
 
 (define_insn "*clz<mode>2_lzcnt"
-  [(set (match_operand:SWI248 0 "register_operand" "=r")
-       (clz:SWI248 (match_operand:SWI248 1 "nonimmediate_operand" "rm")))
+  [(set (match_operand:SWI48 0 "register_operand" "=r")
+       (clz:SWI48 (match_operand:SWI48 1 "nonimmediate_operand" "rm")))
    (clobber (reg:CC FLAGS_REG))]
   "TARGET_LZCNT"
   "lzcnt{<imodesuffix>}\t{%1, %0|%0, %1}"
@@ -12956,6 +13023,69 @@
    (set_attr "type" "bitmanip")
    (set_attr "mode" "<MODE>")])
 
+;; Version of lzcnt that is expanded from intrinsics.  This version provides
+;; operand size as output when source operand is zero. 
+
+(define_expand "lzcnt_<mode>"
+  [(parallel
+    [(set (match_operand:SWI248 0 "register_operand")
+         (unspec:SWI248
+           [(match_operand:SWI248 1 "nonimmediate_operand")]
+           UNSPEC_LZCNT))
+     (clobber (reg:CC FLAGS_REG))])]
+  "TARGET_LZCNT")
+
+; False dependency happens when destination is only updated by tzcnt,
+; lzcnt or popcnt.  There is no false dependency when destination is
+; also used in source.
+(define_insn_and_split "*lzcnt_<mode>_falsedep_1"
+  [(set (match_operand:SWI48 0 "register_operand" "=r")
+       (unspec:SWI48
+         [(match_operand:SWI48 1 "nonimmediate_operand" "rm")]
+         UNSPEC_LZCNT))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_LZCNT
+   && TARGET_AVOID_FALSE_DEP_FOR_BMI && optimize_function_for_speed_p (cfun)"
+  "#"
+  "&& reload_completed"
+  [(parallel
+    [(set (match_dup 0)
+         (unspec:SWI48 [(match_dup 1)] UNSPEC_LZCNT))
+     (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)
+     (clobber (reg:CC FLAGS_REG))])]
+{
+  if (!reg_mentioned_p (operands[0], operands[1]))
+    ix86_expand_clear (operands[0]);
+})
+
+(define_insn "*lzcnt_<mode>_falsedep"
+  [(set (match_operand:SWI48 0 "register_operand" "=r")
+       (unspec:SWI48
+         [(match_operand:SWI48 1 "nonimmediate_operand" "rm")]
+         UNSPEC_LZCNT))
+   (unspec [(match_operand:SWI48 2 "register_operand" "0")]
+          UNSPEC_INSN_FALSE_DEP)
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_LZCNT"
+  "lzcnt{<imodesuffix>}\t{%1, %0|%0, %1}"
+  [(set_attr "type" "alu1")
+   (set_attr "prefix_0f" "1")
+   (set_attr "prefix_rep" "1")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "*lzcnt_<mode>"
+  [(set (match_operand:SWI248 0 "register_operand" "=r")
+       (unspec:SWI248
+         [(match_operand:SWI248 1 "nonimmediate_operand" "rm")]
+         UNSPEC_LZCNT))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_LZCNT"
+  "lzcnt{<imodesuffix>}\t{%1, %0|%0, %1}"
+  [(set_attr "type" "alu1")
+   (set_attr "prefix_0f" "1")
+   (set_attr "prefix_rep" "1")
+   (set_attr "mode" "<MODE>")])
+
 ;; BMI instructions.
 (define_insn "*bmi_andn_<mode>"
   [(set (match_operand:SWI48 0 "register_operand" "=r,r")
Index: config/i386/lzcntintrin.h
===================================================================
--- config/i386/lzcntintrin.h   (revision 241451)
+++ config/i386/lzcntintrin.h   (working copy)
@@ -38,19 +38,19 @@
 extern __inline unsigned short __attribute__((__gnu_inline__, 
__always_inline__, __artificial__))
 __lzcnt16 (unsigned short __X)
 {
-  return __builtin_clzs (__X);
+  return __builtin_ia32_lzcnt_u16 (__X);
 }
 
 extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))
 __lzcnt32 (unsigned int __X)
 {
-  return __builtin_clz (__X);
+  return __builtin_ia32_lzcnt_u32 (__X);
 }
 
 extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))
 _lzcnt_u32 (unsigned int __X)
 {
-  return __builtin_clz (__X);
+  return __builtin_ia32_lzcnt_u32 (__X);
 }
 
 #ifdef __x86_64__
@@ -57,13 +57,13 @@ _lzcnt_u32 (unsigned int __X)
 extern __inline unsigned long long __attribute__((__gnu_inline__, 
__always_inline__, __artificial__))
 __lzcnt64 (unsigned long long __X)
 {
-  return __builtin_clzll (__X);
+  return __builtin_ia32_lzcnt_u64 (__X);
 }
 
 extern __inline unsigned long long __attribute__((__gnu_inline__, 
__always_inline__, __artificial__))
 _lzcnt_u64 (unsigned long long __X)
 {
-  return __builtin_clzll (__X);
+  return __builtin_ia32_lzcnt_u64 (__X);
 }
 #endif
 
Index: testsuite/gcc.target/i386/bmi-6.c
===================================================================
--- testsuite/gcc.target/i386/bmi-6.c   (revision 241451)
+++ testsuite/gcc.target/i386/bmi-6.c   (working copy)
@@ -1,4 +1,5 @@
 /* { dg-do link } */
+/* { dg-xfail-if "PR 78057" { "*-*-*" } { "*" } { "" } } */
 /* { dg-options "-O2 -mbmi" } */
 
 #include <x86intrin.h>
Index: testsuite/gcc.target/i386/pr78037.c
===================================================================
--- testsuite/gcc.target/i386/pr78037.c (nonexistent)
+++ testsuite/gcc.target/i386/pr78037.c (working copy)
@@ -0,0 +1,21 @@
+/* { dg-do run } */
+/* { dg-require-effective-target bmi } */
+/* { dg-options "-O2 -mbmi" } */
+
+#include <x86intrin.h>
+
+#include "bmi-check.h"
+
+int
+__attribute__((noinline, noclone))
+foo (int x)
+{
+  return __tzcnt_u32 (x) & 0x1f;
+}
+
+static void
+bmi_test ()
+{
+  if (foo (0) != 0)
+    abort ();
+}
Index: testsuite/gcc.target/i386/pr78057.c
===================================================================
--- testsuite/gcc.target/i386/pr78057.c (nonexistent)
+++ testsuite/gcc.target/i386/pr78057.c (working copy)
@@ -0,0 +1,42 @@
+/* PR target/78057 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mbmi -mlzcnt -fdump-tree-optimized" } */
+
+extern void link_error (void);
+
+int
+foo (int x)
+{
+  if (__builtin_ia32_tzcnt_u16 (16) != 4
+      || __builtin_ia32_tzcnt_u16 (0) != 16
+      || __builtin_ia32_lzcnt_u16 (0x1ff) != 7
+      || __builtin_ia32_lzcnt_u16 (0) != 16
+      || __builtin_ia32_tzcnt_u32 (8) != 3
+      || __builtin_ia32_tzcnt_u32 (0) != 32
+      || __builtin_ia32_lzcnt_u32 (0x3fffffff) != 2
+      || __builtin_ia32_lzcnt_u32 (0) != 32
+#ifdef __x86_64__
+      || __builtin_ia32_tzcnt_u64 (4) != 2
+      || __builtin_ia32_tzcnt_u64 (0) != 64
+      || __builtin_ia32_lzcnt_u64 (0x1fffffff) != 35
+      || __builtin_ia32_lzcnt_u64 (0) != 64
+#endif
+     )
+    link_error ();
+  x += 2;
+  if (x == 0)
+    return 5;
+  return __builtin_ia32_tzcnt_u32 (x)
+         + __builtin_ia32_lzcnt_u32 (x)
+#ifdef __x86_64__
+        + __builtin_ia32_tzcnt_u64 (x)
+        + __builtin_ia32_lzcnt_u64 (x)
+#endif
+        ;
+}
+
+/* { dg-final { scan-tree-dump-not "link_error" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "__builtin_ia32_\[lt]zcnt" "optimized" } } */
+/* { dg-final { scan-tree-dump-times "__builtin_ctz " 1 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "__builtin_clz " 1 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "__builtin_ctzll " 1 "optimized" { target lp64 } } } */

Reply via email to