Hi Uros,
This revision implements your suggestions/refinements. (i) Avoid the
UNSPEC_CMC by using the canonical RTL idiom for *x86_cmc, (ii) Use
peephole2s to convert x86_stc and *x86_cmc into alternate forms on
TARGET_SLOW_STC CPUs (pentium4), when a suitable QImode register is
available, (iii) Prefer the addqi_cconly_overflow idiom (addb $-1,%al)
over negqi_ccc_1 (neg %al) for setting the carry from a QImode value,
(iv) Use andl %eax,%eax to clear carry flag without requiring (clobbering)
an additional register, as an alternate output template for *x86_clc.
These changes required two minor edits to i386.cc:  ix86_cc_mode had
to be tweaked to suggest CCCmode for the new *x86_cmc pattern, and
*x86_cmc needed to be handled/parameterized in ix86_rtx_costs so that
combine would appreciate that this complex RTL expression was actually
a fast, single byte instruction [i.e. preferable].

This patch has been tested on x86_64-pc-linux-gnu with make bootstrap
and make -k check, both with and without --target_board=unix{-m32}
with no new failures.  Ok for mainline?

2023-06-06  Roger Sayle  <ro...@nextmovesoftware.com>
            Uros Bizjak  <ubiz...@gmail.com>

gcc/ChangeLog
        * config/i386/i386-expand.cc (ix86_expand_builtin) <handlecarry>:
        Use new x86_stc instruction when the carry flag must be set.
        * config/i386/i386.cc (ix86_cc_mode): Use CCCmode for *x86_cmc.
        (ix86_rtx_costs): Provide accurate rtx_costs for *x86_cmc.
        * config/i386/i386.h (TARGET_SLOW_STC): New define.
        * config/i386/i386.md (UNSPEC_CLC): New UNSPEC for clc.
        (UNSPEC_STC): New UNSPEC for stc.
        (*x86_clc): New define_insn (with implementation for pentium4).
        (x86_stc): New define_insn.
        (define_peephole2): Convert x86_stc into alternate implementation
        on pentium4 without -Os when a QImode register is available.
        (*x86_cmc): New define_insn.
        (define_peephole2): Convert *x86_cmc into alternate implementation
        on pentium4 without -Os when a QImode register is available.
        (*setccc): New define_insn_and_split for a no-op CCCmode move.
        (*setcc_qi_negqi_ccc_1_<mode>): New define_insn_and_split to
        recognize (and eliminate) the carry flag being copied to itself.
        (*setcc_qi_negqi_ccc_2_<mode>): Likewise.
        * config/i386/x86-tune.def (X86_TUNE_SLOW_STC): New tuning flag.

gcc/testsuite/ChangeLog
        * gcc.target/i386/cmc-1.c: New test case.
        * gcc.target/i386/stc-1.c: Likewise.


Thanks, Roger.
--

-----Original Message-----
From: Uros Bizjak <ubiz...@gmail.com> 
Sent: 04 June 2023 18:53
To: Roger Sayle <ro...@nextmovesoftware.com>
Cc: gcc-patches@gcc.gnu.org
Subject: Re: [x86 PATCH] Add support for stc, clc and cmc instructions in 
i386.md

On Sun, Jun 4, 2023 at 12:45 AM Roger Sayle <ro...@nextmovesoftware.com> wrote:
>
>
> This patch is the latest revision of my patch to add support for the 
> STC (set carry flag), CLC (clear carry flag) and CMC (complement carry 
> flag) instructions to the i386 backend, incorporating Uros'
> previous feedback.  The significant changes are (i) the inclusion of 
> CMC, (ii) the use of UNSPEC for pattern, (iii) Use of a new 
> X86_TUNE_SLOW_STC tuning flag to use alternate implementations on
> pentium4 (which has a notoriously slow STC) when not optimizing for 
> size.
>
> An example of the use of the stc instruction is:
> unsigned int foo (unsigned int a, unsigned int b, unsigned int *c) {
>   return __builtin_ia32_addcarryx_u32 (1, a, b, c); }
>
> which previously generated:
>         movl    $1, %eax
>         addb    $-1, %al
>         adcl    %esi, %edi
>         setc    %al
>         movl    %edi, (%rdx)
>         movzbl  %al, %eax
>         ret
>
> with this patch now generates:
>         stc
>         adcl    %esi, %edi
>         setc    %al
>         movl    %edi, (%rdx)
>         movzbl  %al, %eax
>         ret
>
> An example of the use of the cmc instruction (where the carry from a 
> first adc is inverted/complemented as input to a second adc) is:
> unsigned int bar (unsigned int a, unsigned int b,
>                   unsigned int c, unsigned int d) {
>   unsigned int c1 = __builtin_ia32_addcarryx_u32 (1, a, b, &o1);
>   return __builtin_ia32_addcarryx_u32 (c1 ^ 1, c, d, &o2); }
>
> which previously generated:
>         movl    $1, %eax
>         addb    $-1, %al
>         adcl    %esi, %edi
>         setnc   %al
>         movl    %edi, o1(%rip)
>         addb    $-1, %al
>         adcl    %ecx, %edx
>         setc    %al
>         movl    %edx, o2(%rip)
>         movzbl  %al, %eax
>         ret
>
> and now generates:
>         stc
>         adcl    %esi, %edi
>         cmc
>         movl    %edi, o1(%rip)
>         adcl    %ecx, %edx
>         setc    %al
>         movl    %edx, o2(%rip)
>         movzbl  %al, %eax
>         ret
>
>
> This patch has been tested on x86_64-pc-linux-gnu with make bootstrap 
> and make -k check, both with and without --target_board=unix{-m32} 
> with no new failures.  Ok for mainline?
>
>
> 2022-06-03  Roger Sayle  <ro...@nextmovesoftware.com>
>
> gcc/ChangeLog
>         * config/i386/i386-expand.cc (ix86_expand_builtin) <handlecarry>:
>         Use new x86_stc or negqi_ccc_1 instructions to set the carry flag.
>         * config/i386/i386.h (TARGET_SLOW_STC): New define.
>         * config/i386/i386.md (UNSPEC_CLC): New UNSPEC for clc.
>         (UNSPEC_STC): New UNSPEC for stc.
>         (UNSPEC_CMC): New UNSPEC for cmc.
>         (*x86_clc): New define_insn.
>         (*x86_clc_xor): New define_insn for pentium4 without -Os.
>         (x86_stc): New define_insn.
>         (define_split): Convert x86_stc into alternate implementation
>         on pentium4.
>         (x86_cmc): New define_insn.
>         (*x86_cmc_1): New define_insn_and_split to recognize cmc pattern.
>         (*setcc_qi_negqi_ccc_1_<mode>): New define_insn_and_split to
>         recognize (and eliminate) the carry flag being copied to itself.
>         (*setcc_qi_negqi_ccc_2_<mode>): Likewise.
>         (neg<mode>_ccc_1): Renamed from *neg<mode>_ccc_1 for gen function.
>         * config/i386/x86-tune.def (X86_TUNE_SLOW_STC): New tuning flag.
>
> gcc/testsuite/ChangeLog
>         * gcc.target/i386/cmc-1.c: New test case.
>         * gcc.target/i386/stc-1.c: Likewise.

+;; Clear carry flag.
+(define_insn "*x86_clc"
+  [(set (reg:CCC FLAGS_REG) (unspec:CCC [(const_int 0)] UNSPEC_CLC))]
+  "!TARGET_SLOW_STC || optimize_function_for_size_p (cfun)"
+  "clc"
+  [(set_attr "length" "1")
+   (set_attr "length_immediate" "0")
+   (set_attr "modrm" "0")])
+
+(define_insn "*x86_clc_xor"
+  [(set (reg:CCC FLAGS_REG) (unspec:CCC [(const_int 0)] UNSPEC_CLC))
+   (clobber (match_scratch:SI 0 "=r"))]
+  "TARGET_SLOW_STC && !optimize_function_for_size_p (cfun)"
+  "xor{l}\t%0, %0"
+    [(set_attr "type" "alu1")
+     (set_attr "mode" "SI")
+     (set_attr "length_immediate" "0")])

I think the above would be better implemented as a peephole2 pattern that 
triggers when a register is available. We should not waste a register on a 
register starved x86_32 just to set a carry flag. This is implemented with:

  [(match_scratch:SI 2 "r")

at the beginning of the peephole2 pattern that generates x86_clc_xor.
The pattern should be constrained with "TARGET_SLOW_STC && 
!optimize_function_for_size_p()" and x86_clc_xor should be available only after 
reload (like e.g. "*mov<mode>_xor").

+;; On Pentium 4, set the carry flag using mov $1,%al;neg %al.
+(define_split
+  [(set (reg:CCC FLAGS_REG) (unspec:CCC [(const_int 0)] UNSPEC_STC))]
+  "TARGET_SLOW_STC
+   && !optimize_insn_for_size_p ()
+   && can_create_pseudo_p ()"
+  [(set (match_dup 0) (const_int 1))
+   (parallel
+     [(set (reg:CCC FLAGS_REG)
+       (unspec:CCC [(match_dup 0) (const_int 0)] UNSPEC_CC_NE))
+      (set (match_dup 0) (neg:QI (match_dup 0)))])]
+  "operands[0] = gen_reg_rtx (QImode);")

Same here, this should be a peephole2 to trigger only when a register is 
available, again for "TARGET_SLOW_STC && !optimize_function_for_size_p()".

Is the new sequence "mov $1,%al; neg %al" any better than existing "mov $1,%al; 
addb $-1,%al" or is there another reason for the changed sequence?

+(define_insn_and_split "*x86_cmc_1"
+  [(set (reg:CCC FLAGS_REG)
+        (unspec:CCC [(geu:QI (reg:CCC FLAGS_REG) (const_int 0))
+             (const_int 0)] UNSPEC_CC_NE))]
+  "!TARGET_SLOW_STC || optimize_function_for_size_p (cfun)"
+  "#"
+  "&& 1"
+  [(set (reg:CCC FLAGS_REG) (unspec:CCC [(reg:CCC FLAGS_REG)] 
+UNSPEC_CMC))])

The above should be the RTL model of x86_cmc. No need to split to an unspec.

Uros.
diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index ac67441..697eb47 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -13948,8 +13948,6 @@ rdseed_step:
       arg3 = CALL_EXPR_ARG (exp, 3); /* unsigned int *sum_out.  */
 
       op1 = expand_normal (arg0);
-      if (!integer_zerop (arg0))
-       op1 = copy_to_mode_reg (QImode, convert_to_mode (QImode, op1, 1));
 
       op2 = expand_normal (arg1);
       if (!register_operand (op2, mode0))
@@ -13967,7 +13965,7 @@ rdseed_step:
        }
 
       op0 = gen_reg_rtx (mode0);
-      if (integer_zerop (arg0))
+      if (op1 == const0_rtx)
        {
          /* If arg0 is 0, optimize right away into add or sub
             instruction that sets CCCmode flags.  */
@@ -13977,7 +13975,14 @@ rdseed_step:
       else
        {
          /* Generate CF from input operand.  */
-         emit_insn (gen_addqi3_cconly_overflow (op1, constm1_rtx));
+         if (!CONST_INT_P (op1))
+           {
+             op1 = convert_to_mode (QImode, op1, 1);
+             op1 = copy_to_mode_reg (QImode, op1);
+             emit_insn (gen_addqi3_cconly_overflow (op1, constm1_rtx));
+           }
+         else
+           emit_insn (gen_x86_stc ());
 
          /* Generate instruction that consumes CF.  */
          op1 = gen_rtx_REG (CCCmode, FLAGS_REG);
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index d4ff56e..c4591d6 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -15954,6 +15954,17 @@ ix86_cc_mode (enum rtx_code code, rtx op0, rtx op1)
               && REGNO (XEXP (op1, 0)) == FLAGS_REG
               && XEXP (op1, 1) == const0_rtx)
        return CCCmode;
+      /* Similarly for *x86_cmc pattern.
+        Match LTU of op0 (neg:QI (ltu:QI (reg:CCC FLAGS_REG) (const_int 0)))
+        and op1 (geu:QI (reg:CCC FLAGS_REG) (const_int 0)).
+        It is sufficient to test that the operand modes are CCCmode.  */
+      else if (code == LTU
+              && GET_CODE (op0) == NEG
+              && GET_CODE (XEXP (op0, 0)) == LTU
+              && GET_MODE (XEXP (XEXP (op0, 0), 0)) == CCCmode
+              && GET_CODE (op1) == GEU
+              && GET_MODE (XEXP (op1, 0)) == CCCmode)
+       return CCCmode;
       else
        return CCmode;
     case GTU:                  /* CF=0 & ZF=0 */
@@ -21305,6 +21316,31 @@ ix86_rtx_costs (rtx x, machine_mode mode, int 
outer_code_i, int opno,
          *total = 0;
          return true;
        }
+      /* Match x
+        (compare:CCC (neg:QI (ltu:QI (reg:CCC FLAGS_REG) (const_int 0)))
+                     (geu:QI (reg:CCC FLAGS_REG) (const_int 0)))  */
+      if (mode == CCCmode
+         && GET_CODE (op0) == NEG
+         && GET_CODE (XEXP (op0, 0)) == LTU
+         && REG_P (XEXP (XEXP (op0, 0), 0))
+         && GET_MODE (XEXP (XEXP (op0, 0), 0)) == CCCmode
+         && REGNO (XEXP (XEXP (op0, 0), 0)) == FLAGS_REG
+         && XEXP (XEXP (op0, 0), 1) == const0_rtx
+         && GET_CODE (op1) == GEU
+         && REG_P (XEXP (op1, 0))
+         && GET_MODE (XEXP (op1, 0)) == CCCmode
+         && REGNO (XEXP (op1, 0)) == FLAGS_REG
+         && XEXP (op1, 1) == const0_rtx)
+       {
+         /* This is *x86_cmc.  */
+         if (!speed)
+           *total = COSTS_N_BYTES (1);
+         else if (TARGET_SLOW_STC)
+           *total = COSTS_N_INSNS (2);
+         else 
+           *total = COSTS_N_INSNS (1);
+         return true;
+       }
 
       if (SCALAR_INT_MODE_P (GET_MODE (op0))
          && GET_MODE_SIZE (GET_MODE (op0)) > UNITS_PER_WORD)
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index c7439f8..5ac9c78 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -448,6 +448,7 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST];
        ix86_tune_features[X86_TUNE_V2DF_REDUCTION_PREFER_HADDPD]
 #define TARGET_DEST_FALSE_DEP_FOR_GLC \
        ix86_tune_features[X86_TUNE_DEST_FALSE_DEP_FOR_GLC]
+#define TARGET_SLOW_STC ix86_tune_features[X86_TUNE_SLOW_STC]
 
 /* Feature tests against the various architecture variations.  */
 enum ix86_arch_indices {
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index e6ebc46..bc48fc4 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -114,6 +114,8 @@
   UNSPEC_INSN_FALSE_DEP
   UNSPEC_SBB
   UNSPEC_CC_NE
+  UNSPEC_CLC
+  UNSPEC_STC
 
   ;; For SSE/MMX support:
   UNSPEC_FIX_NOTRUNC
@@ -1999,6 +2001,71 @@
   [(set_attr "type" "ssecomi")
    (set_attr "prefix" "evex")
    (set_attr "mode" "HF")])
+
+;; Clear carry flag.
+(define_insn "*x86_clc"
+  [(set (reg:CCC FLAGS_REG) (unspec:CCC [(const_int 0)] UNSPEC_CLC))]
+  ""
+{
+  if (TARGET_SLOW_STC && !optimize_function_for_size_p (cfun))
+    return "and{l}\t{%%eax, %%eax|eax, eax}";
+  return "clc";
+}
+  [(set (attr "length")
+       (if_then_else
+         (and (match_test "TARGET_SLOW_STC")
+              (not (match_test "optimize_function_for_size_p (cfun)")))
+         (const_int 2)
+         (const_int 1)))
+   (set_attr "length_immediate" "0")
+   (set_attr "modrm" "0")])
+
+;; Set carry flag.
+(define_insn "x86_stc"
+  [(set (reg:CCC FLAGS_REG) (unspec:CCC [(const_int 0)] UNSPEC_STC))]
+  ""
+  "stc"
+  [(set_attr "length" "1")
+   (set_attr "length_immediate" "0")
+   (set_attr "modrm" "0")])
+
+;; On Pentium 4, set the carry flag using mov $1,%al;addb $-1,%al.
+(define_peephole2
+  [(match_scratch:QI 0 "r")
+   (set (reg:CCC FLAGS_REG) (unspec:CCC [(const_int 0)] UNSPEC_STC))]
+  "TARGET_SLOW_STC && !optimize_insn_for_size_p ()"
+  [(set (match_dup 0) (const_int 1))
+   (parallel
+     [(set (reg:CCC FLAGS_REG)
+          (compare:CCC (plus:QI (match_dup 0) (const_int -1))
+                       (match_dup 0)))
+      (set (match_dup 0) (plus:QI (match_dup 0) (const_int -1)))])])
+
+;; Complement carry flag.
+(define_insn "*x86_cmc"
+  [(set (reg:CCC FLAGS_REG)
+       (compare:CCC (neg:QI (ltu:QI (reg:CCC FLAGS_REG) (const_int 0)))
+                    (geu:QI (reg:CCC FLAGS_REG) (const_int 0))))]
+  ""
+  "cmc"
+  [(set_attr "length" "1")
+   (set_attr "length_immediate" "0")
+   (set_attr "use_carry" "1")
+   (set_attr "modrm" "0")])
+
+;; On Pentium 4, cmc is replaced with setnc %al;addb $-1,%al.
+(define_peephole2
+  [(match_scratch:QI 0 "r")
+   (set (reg:CCC FLAGS_REG)
+       (compare:CCC (neg:QI (ltu:QI (reg:CCC FLAGS_REG) (const_int 0)))
+                    (geu:QI (reg:CCC FLAGS_REG) (const_int 0))))]
+  "TARGET_SLOW_STC && !optimize_insn_for_size_p ()"
+  [(set (match_dup 0) (ne:QI (reg:CCC FLAGS_REG) (const_int 0)))
+   (parallel
+     [(set (reg:CCC FLAGS_REG)
+          (compare:CCC (plus:QI (match_dup 0) (const_int -1))
+                       (match_dup 0)))
+      (set (match_dup 0) (plus:QI (match_dup 0) (const_int -1)))])])
 
 ;; Push/pop instructions.
 
@@ -8107,6 +8174,34 @@
   "#"
   "&& 1"
   [(const_int 0)])
+
+;; Set the carry flag from the carry flag.
+(define_insn_and_split "*setccc"
+  [(set (reg:CCC FLAGS_REG)
+       (reg:CCC FLAGS_REG))]
+  "ix86_pre_reload_split ()"
+  "#"
+  "&& 1"
+  [(const_int 0)])
+
+;; Set the carry flag from the carry flag.
+(define_insn_and_split "*setcc_qi_negqi_ccc_1_<mode>"
+  [(set (reg:CCC FLAGS_REG)
+       (ltu:CCC (reg:CC_CCC FLAGS_REG) (const_int 0)))]
+  "ix86_pre_reload_split ()"
+  "#"
+  "&& 1"
+  [(const_int 0)])
+
+;; Set the carry flag from the carry flag.
+(define_insn_and_split "*setcc_qi_negqi_ccc_2_<mode>"
+  [(set (reg:CCC FLAGS_REG)
+       (unspec:CCC [(ltu:QI (reg:CC_CCC FLAGS_REG) (const_int 0))
+                    (const_int 0)] UNSPEC_CC_NE))]
+  "ix86_pre_reload_split ()"
+  "#"
+  "&& 1"
+  [(const_int 0)])
 
 ;; Overflow setting add instructions
 
diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
index e1c72cd..c3229d2 100644
--- a/gcc/config/i386/x86-tune.def
+++ b/gcc/config/i386/x86-tune.def
@@ -698,3 +698,7 @@ DEF_TUNE (X86_TUNE_PROMOTE_QI_REGS, "promote_qi_regs", 
m_NONE)
 /* X86_TUNE_EMIT_VZEROUPPER: This enables vzeroupper instruction insertion
    before a transfer of control flow out of the function.  */
 DEF_TUNE (X86_TUNE_EMIT_VZEROUPPER, "emit_vzeroupper", ~m_KNL)
+
+/* X86_TUNE_SLOW_STC: This disables use of stc, clc and cmc carry flag
+  modifications on architectures where these operations are slow.  */
+DEF_TUNE (X86_TUNE_SLOW_STC, "slow_stc", m_PENT4)
diff --git a/gcc/testsuite/gcc.target/i386/cmc-1.c 
b/gcc/testsuite/gcc.target/i386/cmc-1.c
new file mode 100644
index 0000000..58e922a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/cmc-1.c
@@ -0,0 +1,28 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+unsigned int o1;
+unsigned int o2;
+
+unsigned int foo_xor (unsigned int a, unsigned int b,
+                      unsigned int c, unsigned int d)
+{
+  unsigned int c1 = __builtin_ia32_addcarryx_u32 (1, a, b, &o1);
+  return __builtin_ia32_addcarryx_u32 (c1 ^ 1, c, d, &o2);
+}
+
+unsigned int foo_sub (unsigned int a, unsigned int b,
+                      unsigned int c, unsigned int d)
+{
+  unsigned int c1 = __builtin_ia32_addcarryx_u32 (1, a, b, &o1);
+  return __builtin_ia32_addcarryx_u32 (1 - c1, c, d, &o2);
+}
+
+unsigned int foo_eqz (unsigned int a, unsigned int b,
+                      unsigned int c, unsigned int d)
+{
+  unsigned int c1 = __builtin_ia32_addcarryx_u32 (1, a, b, &o1);
+  return __builtin_ia32_addcarryx_u32 (c1 == 0, c, d, &o2);
+}
+
+/* { dg-final { scan-assembler "cmc" } } */
diff --git a/gcc/testsuite/gcc.target/i386/stc-1.c 
b/gcc/testsuite/gcc.target/i386/stc-1.c
new file mode 100644
index 0000000..857c939
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/stc-1.c
@@ -0,0 +1,21 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+typedef unsigned int u32;
+
+unsigned int foo (unsigned int a, unsigned int b, unsigned int *c)
+{
+  return __builtin_ia32_addcarryx_u32 (1, a, b, c);
+}
+
+unsigned int bar (unsigned int b, unsigned int *c)
+{
+  return __builtin_ia32_addcarryx_u32 (1, 2, b, c);
+}
+
+unsigned int baz (unsigned int a, unsigned int *c)
+{
+  return __builtin_ia32_addcarryx_u32 (1, a, 3, c);
+}
+
+/* { dg-final { scan-assembler "stc" } } */

Reply via email to