[PATCH v1 1/2] LoongArch: Optimize immediate load.

Lulu Cheng Sat, 29 Oct 2022 00:11:20 -0700

Fix an issue where the compiler would not hoist the four instructions that
load a 64-bit immediate out of a loop.


gcc/ChangeLog:

        * config/loongarch/constraints.md (x): New constraint.
        * config/loongarch/loongarch.cc (struct loongarch_integer_op):
        Define a new member curr_value, that records the value of
        the number stored in the destination register immediately
        after the current instruction has run.
        (loongarch_build_integer): Add a method that loads bits 32-63
        of the immediate.
        (loongarch_move_integer): Same as above.
        * config/loongarch/loongarch.h (HWIT_UC_0xFFFFFFFF): New macro.
        (HI32_OPERAND): Likewise.
        * config/loongarch/loongarch.md (load_hi32): New template.
        * config/loongarch/predicates.md (const_hi32_operand): New predicate
        that matches an immediate whose low 32 bits are all zero.
        (hi32_mask_operand): New predicate that matches the mask 0xffffffff,
        which keeps only the low 32 bits.

gcc/testsuite/ChangeLog:

        * gcc.target/loongarch/imm-load.c: New test.
---
 gcc/config/loongarch/constraints.md           |  7 +-
 gcc/config/loongarch/loongarch.cc             | 95 ++++++++++++-------
 gcc/config/loongarch/loongarch.h              |  6 ++
 gcc/config/loongarch/loongarch.md             | 26 +++++
 gcc/config/loongarch/predicates.md            |  8 ++
 gcc/testsuite/gcc.target/loongarch/imm-load.c | 25 +++++
 6 files changed, 133 insertions(+), 34 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/loongarch/imm-load.c

diff --git a/gcc/config/loongarch/constraints.md 
b/gcc/config/loongarch/constraints.md
index 43cb7b5f0f5..1dcf09ce5eb 100644
--- a/gcc/config/loongarch/constraints.md
+++ b/gcc/config/loongarch/constraints.md
@@ -46,7 +46,7 @@
 ;; "u" "A signed 52bit constant and low 32-bit is zero (for logic 
instructions)"
 ;; "v" "A signed 64-bit constant and low 44-bit is zero (for logic 
instructions)."
 ;; "w" "Matches any valid memory."
-;; "x" <-----unused
+;; "x" "A signed 64-bit constant and low 32-bit is zero (for logic 
instructions)."
 ;; "y" <-----unused
 ;; "z" FCC_REGS
 ;; "A" <-----unused
@@ -139,6 +139,11 @@ (define_constraint "v"
   (and (match_code "const_int")
        (match_test "LU52I_OPERAND (ival)")))
 
+(define_constraint "x"
+  "A signed 64-bit constant and low 32-bit is zero (for logic instructions)."
+  (and (match_code "const_int")
+       (match_test "HI32_OPERAND (ival)")))
+
 (define_register_constraint "z" "FCC_REGS"
   "A floating-point condition code register.")
 
diff --git a/gcc/config/loongarch/loongarch.cc 
b/gcc/config/loongarch/loongarch.cc
index f54c233f90c..5e8cd293645 100644
--- a/gcc/config/loongarch/loongarch.cc
+++ b/gcc/config/loongarch/loongarch.cc
@@ -139,6 +139,9 @@ struct loongarch_address_info
    METHOD_LU52I:
      Load 52-63 bit of the immediate number.
 
+   METHOD_LD_HI32:
+     Load 32-63 bit of the immediate number.
+
    METHOD_INSV:
      immediate like 0xfff00000fffffxxx
    */
@@ -147,13 +150,18 @@ enum loongarch_load_imm_method
   METHOD_NORMAL,
   METHOD_LU32I,
   METHOD_LU52I,
+  METHOD_LD_HI32,
   METHOD_INSV
 };
 
 struct loongarch_integer_op
 {
   enum rtx_code code;
+  /* The immediate operand of the current load instruction.  */
   HOST_WIDE_INT value;
+  /* The value held in the destination register immediately after the
+     current instruction has run.  */
+  HOST_WIDE_INT curr_value;
   enum loongarch_load_imm_method method;
 };
 
@@ -1474,24 +1482,27 @@ loongarch_build_integer (struct loongarch_integer_op 
*codes,
     {
       /* The value of the lower 32 bit be loaded with one instruction.
         lu12i.w.  */
-      codes[0].code = UNKNOWN;
-      codes[0].method = METHOD_NORMAL;
-      codes[0].value = low_part;
+      codes[cost].code = UNKNOWN;
+      codes[cost].method = METHOD_NORMAL;
+      codes[cost].value = low_part;
+      codes[cost].curr_value = low_part;
       cost++;
     }
   else
     {
       /* lu12i.w + ior.  */
-      codes[0].code = UNKNOWN;
-      codes[0].method = METHOD_NORMAL;
-      codes[0].value = low_part & ~(IMM_REACH - 1);
+      codes[cost].code = UNKNOWN;
+      codes[cost].method = METHOD_NORMAL;
+      codes[cost].value = low_part & ~(IMM_REACH - 1);
+      codes[cost].curr_value = codes[cost].value;
       cost++;
       HOST_WIDE_INT iorv = low_part & (IMM_REACH - 1);
       if (iorv != 0)
        {
-         codes[1].code = IOR;
-         codes[1].method = METHOD_NORMAL;
-         codes[1].value = iorv;
+         codes[cost].code = IOR;
+         codes[cost].method = METHOD_NORMAL;
+         codes[cost].value = iorv;
+         codes[cost].curr_value = low_part;
          cost++;
        }
     }
@@ -1514,23 +1525,34 @@ loongarch_build_integer (struct loongarch_integer_op 
*codes,
        {
          codes[cost].method = METHOD_LU52I;
          codes[cost].value = value & LU52I_B;
-         return cost + 1;
+         codes[cost].curr_value = codes[cost].value | 
(codes[cost-1].curr_value &
+                                                       0xfffffffffffff);
+         return cost++;
        }
 
-      codes[cost].method = METHOD_LU32I;
-      codes[cost].value = (value & LU32I_B) | (sign51 ? LU52I_B : 0);
-      cost++;
-
-      /* Determine whether the 52-61 bits are sign-extended from the low order,
-        and if not, load the 52-61 bits.  */
-      if (!lu52i[(value & (HOST_WIDE_INT_1U << 51)) >> 51])
+      if (lu52i[sign51])
        {
-         codes[cost].method = METHOD_LU52I;
-         codes[cost].value = value & LU52I_B;
+         /* Determine whether bits 52-63 are sign-extended from the lower
+            bits.  If so, bits 52-63 of the immediate number do not need
+            to be loaded.  */
+         codes[cost].method = METHOD_LU32I;
+         codes[cost].value = (value & LU32I_B) | (sign51 ? LU52I_B : 0);
+         codes[cost].curr_value = codes[cost].value | 
(codes[cost-1].curr_value &
+                                                       0xffffffff);
+         cost++;
+       }
+      else
+       {
+         /* If loading the upper 32 bits of the 64-bit immediate would take
+            two instructions, emit the load_hi32 pseudo-instruction instead;
+            it is split into those two instructions later.  */
+         codes[cost].method = METHOD_LD_HI32;
+         codes[cost].value = value & 0xffffffff00000000;
+         codes[cost].curr_value = codes[cost].value | 
(codes[cost-1].curr_value &
+                                                       0xffffffff);
          cost++;
        }
     }
-
   gcc_assert (cost <= LARCH_MAX_INTEGER_OPS);
 
   return cost;
@@ -2910,30 +2932,37 @@ loongarch_move_integer (rtx temp, rtx dest, unsigned 
HOST_WIDE_INT value)
       else
        x = force_reg (mode, x);
 
+      set_unique_reg_note (get_last_insn (), REG_EQUAL, GEN_INT 
(codes[i-1].curr_value));
+
       switch (codes[i].method)
        {
        case METHOD_NORMAL:
+         /* mov or ior.  */
          x = gen_rtx_fmt_ee (codes[i].code, mode, x,
                              GEN_INT (codes[i].value));
          break;
        case METHOD_LU32I:
-         emit_insn (
-           gen_rtx_SET (x,
-                        gen_rtx_IOR (DImode,
-                                     gen_rtx_ZERO_EXTEND (
-                                       DImode, gen_rtx_SUBREG (SImode, x, 0)),
-                                     GEN_INT (codes[i].value))));
+         gcc_assert (mode == DImode);
+         /* lu32i_d */
+         x = gen_rtx_IOR (mode, gen_rtx_ZERO_EXTEND (mode,
+                                               gen_rtx_SUBREG (SImode, x, 0)),
+                          GEN_INT (codes[i].value));
          break;
        case METHOD_LU52I:
-         emit_insn (gen_lu52i_d (x, x, GEN_INT (0xfffffffffffff),
-                                 GEN_INT (codes[i].value)));
+         gcc_assert (mode == DImode);
+         /* lu52i_d */
+         x = gen_rtx_IOR (mode, gen_rtx_AND (mode, x, GEN_INT 
(0xfffffffffffff)),
+                          GEN_INT (codes[i].value));
          break;
-       case METHOD_INSV:
-         emit_insn (
-           gen_rtx_SET (gen_rtx_ZERO_EXTRACT (DImode, x, GEN_INT (20),
-                                              GEN_INT (32)),
-                        gen_rtx_REG (DImode, 0)));
+       case METHOD_LD_HI32:
+         /* Load the high 32 bits of the immediate number.  */
+         gcc_assert (mode == DImode);
+         /* load_hi32 */
+         x = gen_rtx_IOR (mode, gen_rtx_AND (mode, x, GEN_INT (0xffffffff)),
+                          GEN_INT (codes[i].value));
          break;
+       case METHOD_INSV:
+         /* It is not currently implemented.  */
        default:
          gcc_unreachable ();
        }
diff --git a/gcc/config/loongarch/loongarch.h b/gcc/config/loongarch/loongarch.h
index f4a9c329fef..cfc046f546e 100644
--- a/gcc/config/loongarch/loongarch.h
+++ b/gcc/config/loongarch/loongarch.h
@@ -605,6 +605,12 @@ enum reg_class
 #define LU52I_OPERAND(VALUE) \
   (((VALUE) | (HWIT_UC_0xFFF << 52)) == (HWIT_UC_0xFFF << 52))
 
+/* True if VALUE can be loaded into a register using load_hi32.  */
+
+#define HWIT_UC_0xFFFFFFFF HOST_WIDE_INT_UC(0xffffffff)
+#define HI32_OPERAND(VALUE) \
+  (((VALUE) | (HWIT_UC_0xFFFFFFFF << 32)) == (HWIT_UC_0xFFFFFFFF << 32))
+
 /* Return a value X with the low 12 bits clear, and such that
    VALUE - X is a signed 12-bit value.  */
 
diff --git a/gcc/config/loongarch/loongarch.md 
b/gcc/config/loongarch/loongarch.md
index 214b14bddd3..7eaa9ab66e3 100644
--- a/gcc/config/loongarch/loongarch.md
+++ b/gcc/config/loongarch/loongarch.md
@@ -1882,6 +1882,32 @@ (define_expand "mov<mode>cc"
   DONE;
 })
 
+(define_insn_and_split "load_hi32"
+  [(set (match_operand:DI 0 "register_operand" "=r")
+       (ior:DI
+         (and:DI (match_operand:DI 1 "register_operand" "0")
+                 (match_operand 2 "hi32_mask_operand"))
+       (match_operand 3 "const_hi32_operand" "x")))]
+  "TARGET_64BIT"
+  "#"
+  ""
+  [(set (match_dup 0)
+        (ior:DI
+          (zero_extend:DI
+            (subreg:SI (match_dup 1) 0))
+          (match_dup 4)))
+   (set (match_dup 0)
+        (ior:DI
+          (and:DI (match_dup 0)
+                  (match_dup 6))
+          (match_dup 5)))]
+{
+  operands[4] = GEN_INT (INTVAL (operands[3]) << 12 >> 12);
+  operands[5] = GEN_INT (INTVAL (operands[3]) & 0xfff0000000000000);
+  operands[6] = GEN_INT (0xfffffffffffff);
+}
+  [(set_attr "insn_count" "2")])
+
 (define_insn "lu32i_d"
   [(set (match_operand:DI 0 "register_operand" "=r")
        (ior:DI
diff --git a/gcc/config/loongarch/predicates.md 
b/gcc/config/loongarch/predicates.md
index 8bd0c1376c9..29d81ff0250 100644
--- a/gcc/config/loongarch/predicates.md
+++ b/gcc/config/loongarch/predicates.md
@@ -35,6 +35,10 @@ (define_predicate "const_lu52i_operand"
   (and (match_code "const_int")
        (match_test "LU52I_OPERAND (INTVAL (op))")))
 
+(define_predicate "const_hi32_operand"
+  (and (match_code "const_int")
+       (match_test "HI32_OPERAND (INTVAL (op))")))
+
 (define_predicate "const_arith_operand"
   (and (match_code "const_int")
        (match_test "IMM12_OPERAND (INTVAL (op))")))
@@ -103,6 +107,10 @@ (define_predicate "lu52i_mask_operand"
   (and (match_code "const_int")
        (match_test "UINTVAL (op) == 0xfffffffffffff")))
 
+(define_predicate "hi32_mask_operand"
+  (and (match_code "const_int")
+       (match_test "UINTVAL (op) == 0xffffffff")))
+
 (define_predicate "low_bitmask_operand"
   (and (match_code "const_int")
        (match_test "low_bitmask_len (mode, INTVAL (op)) > 12")))
diff --git a/gcc/testsuite/gcc.target/loongarch/imm-load.c 
b/gcc/testsuite/gcc.target/loongarch/imm-load.c
new file mode 100644
index 00000000000..91ceb33d058
--- /dev/null
+++ b/gcc/testsuite/gcc.target/loongarch/imm-load.c
@@ -0,0 +1,25 @@
+/* { dg-do compile } */
+/* { dg-options "-mabi=lp64d -O2 -fdump-rtl-loop2_invariant" } */
+
+extern long long b[10];
+static inline long long
+repeat_bytes (void)
+{
+  long long r = 0x0101010101010101;
+
+  return r;
+}
+
+static inline long long
+highbit_mask (long long m)
+{
+  return m & repeat_bytes ();
+}
+
+void test(long long *a)
+{
+  for (int i = 0; i < 10; i++)
+    b[i] = highbit_mask (a[i]);
+
+}
+/* { dg-final { scan-rtl-dump-times "moved without introducing a new temporary 
register" 4 "loop2_invariant" } } */
-- 
2.31.1

[PATCH v1 1/2] LoongArch: Optimize immediate load.

Reply via email to