[PATCH 4/5] LoongArch: New options -mrecip and -mrecip= with ffast-math.

Jiahao Xu Mon, 27 Nov 2023 19:30:34 -0800

When -mrecip option is turned on, use approximate reciprocal instructions and 
approximate
reciprocal square root instructions with additional Newton-Raphson steps to 
implement
single precision floating-point division, square root and reciprocal square 
root operations
for better throughput.


gcc/ChangeLog:

        * config/loongarch/genopts/loongarch.opt.in (recip_mask): New variable.
        (-mrecip, -mrecip): New options.
        * config/loongarch/lasx.md (div<mode>3): New expander.
        (*div<mode>3): Rename.
        (sqrt<mode>2): New expander.
        (*sqrt<mode>2): Rename.
        (rsqrt<mode>2): New expander.
        * config/loongarch/loongarch-protos.h (loongarch_emit_swrsqrtsf): New 
prototype.
        (loongarch_emit_swdivsf): Ditto.
        * config/loongarch/loongarch.cc (loongarch_option_override_internal): 
Set
        recip_mask for -mrecip and -mrecip= options.
        (loongarch_emit_swrsqrtsf): New function.
        (loongarch_emit_swdivsf): Ditto.
        (use_rsqrt_p): Ditto.
        (loongarch_optab_supported_p): Ditto.
        (TARGET_OPTAB_SUPPORTED_P): New hook.
        * config/loongarch/loongarch.h (RECIP_MASK_NONE): New bitmasks.
        (RECIP_MASK_DIV): Ditto.
        (RECIP_MASK_SQRT): Ditto.
        (RECIP_MASK_RSQRT): Ditto.
        (RECIP_MASK_VEC_DIV): Ditto.
        (RECIP_MASK_VEC_SQRT): Ditto.
        (RECIP_MASK_VEC_RSQRT): Ditto.
        (RECIP_MASK_ALL): Ditto.
        (TARGET_RECIP_DIV): New tests.
        (TARGET_RECIP_SQRT): Ditto.
        (TARGET_RECIP_RSQRT): Ditto.
        (TARGET_RECIP_VEC_DIV): Ditto.
        (TARGET_RECIP_VEC_SQRT): Ditto.
        (TARGET_RECIP_VEC_RSQRT): Ditto.
        * config/loongarch/loongarch.md (sqrt<mode>2): New expander.
        (*sqrt<mode>2): Rename.
        (rsqrt<mode>2): New expander.
        * config/loongarch/loongarch.opt (recip_mask): New variable.
        (-mrecip, -mrecip): New options.
        * config/loongarch/lsx.md (div<mode>3): New expander.
        (*div<mode>3): Rename.
        (sqrt<mode>2): New expander.
        (*sqrt<mode>2): Rename.
        (rsqrt<mode>2): New expander.
        * config/loongarch/predicates.md (reg_or_vecotr_1_operand): New 
predicate.
        * doc/invoke.texi (LoongArch Options): Document new options.

gcc/testsuite/ChangeLog:

        * gcc.target/loongarch/recip-divf.c: New test.
        * gcc.target/loongarch/recip-sqrtf.c: New test.
        * gcc.target/loongarch/vector/lasx/lasx-recip-divf.c: New test.
        * gcc.target/loongarch/vector/lasx/lasx-recip-sqrtf.c: New test.
        * gcc.target/loongarch/vector/lsx/lsx-recip-divf.c: New test.
        * gcc.target/loongarch/vector/lsx/lsx-recip-sqrtf.c: New test.

diff --git a/gcc/config/loongarch/genopts/loongarch.opt.in 
b/gcc/config/loongarch/genopts/loongarch.opt.in
index 8af6cc6f532..cc1a9daf7cf 100644
--- a/gcc/config/loongarch/genopts/loongarch.opt.in
+++ b/gcc/config/loongarch/genopts/loongarch.opt.in
@@ -23,6 +23,9 @@ config/loongarch/loongarch-opts.h
 HeaderInclude
 config/loongarch/loongarch-str.h
 
+TargetVariable
+unsigned int recip_mask = 0
+
 ; ISA related options
 ;; Base ISA
 Enum
@@ -197,6 +200,14 @@ mexplicit-relocs
 Target Var(la_opt_explicit_relocs_backward) Init(M_OPT_UNSET)
 Use %reloc() assembly operators (for backward compatibility).
 
+mrecip
+Target RejectNegative Var(loongarch_recip)
+Generate approximate reciprocal divide and square root for better throughput.
+
+mrecip=
+Target RejectNegative Joined Var(loongarch_recip_name)
+Control generation of reciprocal estimates.
+
 ; The code model option names for -mcmodel.
 Enum
 Name(cmodel) Type(int)
diff --git a/gcc/config/loongarch/lasx.md b/gcc/config/loongarch/lasx.md
index 2d7b8f02b4b..08c81ef53e4 100644
--- a/gcc/config/loongarch/lasx.md
+++ b/gcc/config/loongarch/lasx.md
@@ -1249,7 +1249,25 @@ (define_insn "mul<mode>3"
   [(set_attr "type" "simd_fmul")
    (set_attr "mode" "<MODE>")])
 
-(define_insn "div<mode>3"
+(define_expand "div<mode>3"
+  [(set (match_operand:FLASX 0 "register_operand")
+    (div:FLASX (match_operand:FLASX 1 "reg_or_vecotr_1_operand")
+              (match_operand:FLASX 2 "register_operand")))]
+  "ISA_HAS_LASX"
+{
+  if (<MODE>mode == V8SFmode
+    && TARGET_RECIP_VEC_DIV
+    && optimize_insn_for_speed_p ()
+    && flag_finite_math_only && !flag_trapping_math
+    && flag_unsafe_math_optimizations)
+  {
+    loongarch_emit_swdivsf (operands[0], operands[1],
+       operands[2], V8SFmode);
+    DONE;
+  }
+})
+
+(define_insn "*div<mode>3"
   [(set (match_operand:FLASX 0 "register_operand" "=f")
        (div:FLASX (match_operand:FLASX 1 "register_operand" "f")
                   (match_operand:FLASX 2 "register_operand" "f")))]
@@ -1278,7 +1296,23 @@ (define_insn "fnma<mode>4"
   [(set_attr "type" "simd_fmadd")
    (set_attr "mode" "<MODE>")])
 
-(define_insn "sqrt<mode>2"
+(define_expand "sqrt<mode>2"
+  [(set (match_operand:FLASX 0 "register_operand")
+    (sqrt:FLASX (match_operand:FLASX 1 "register_operand")))]
+  "ISA_HAS_LASX"
+{
+  if (<MODE>mode == V8SFmode
+      && TARGET_RECIP_VEC_SQRT
+      && flag_unsafe_math_optimizations
+      && optimize_insn_for_speed_p ()
+      && flag_finite_math_only && !flag_trapping_math)
+    {
+      loongarch_emit_swrsqrtsf (operands[0], operands[1], V8SFmode, 0);
+      DONE;
+    }
+})
+
+(define_insn "*sqrt<mode>2"
   [(set (match_operand:FLASX 0 "register_operand" "=f")
        (sqrt:FLASX (match_operand:FLASX 1 "register_operand" "f")))]
   "ISA_HAS_LASX"
@@ -1710,7 +1744,20 @@ (define_insn "lasx_xvfrint_<flasxfmt>"
   [(set_attr "type" "simd_fcvt")
    (set_attr "mode" "<MODE>")])
 
-(define_insn "rsqrt<mode>2"
+(define_expand "rsqrt<mode>2"
+  [(set (match_operand:FLASX 0 "register_operand" "=f")
+    (unspec:FLASX [(match_operand:FLASX 1 "register_operand" "f")]
+            UNSPEC_LASX_XVFRSQRT))]
+  "ISA_HAS_LASX"
+ {
+   if (<MODE>mode == V8SFmode && TARGET_RECIP_VEC_RSQRT)
+     {
+       loongarch_emit_swrsqrtsf (operands[0], operands[1], V8SFmode, 1);
+       DONE;
+     }
+})
+
+(define_insn "*rsqrt<mode>2"
   [(set (match_operand:FLASX 0 "register_operand" "=f")
     (unspec:FLASX [(match_operand:FLASX 1 "register_operand" "f")]
                  UNSPEC_LASX_XVFRSQRT))]
diff --git a/gcc/config/loongarch/loongarch-protos.h 
b/gcc/config/loongarch/loongarch-protos.h
index cb8fc36b086..f2ff93b5e10 100644
--- a/gcc/config/loongarch/loongarch-protos.h
+++ b/gcc/config/loongarch/loongarch-protos.h
@@ -220,5 +220,7 @@ extern rtx loongarch_gen_const_int_vector_shuffle 
(machine_mode, int);
 extern tree loongarch_build_builtin_va_list (void);
 
 extern rtx loongarch_build_signbit_mask (machine_mode, bool, bool);
+extern void loongarch_emit_swrsqrtsf (rtx, rtx, machine_mode, bool);
+extern void loongarch_emit_swdivsf (rtx, rtx, rtx, machine_mode);
 extern bool loongarch_explicit_relocs_p (enum loongarch_symbol_type);
 #endif /* ! GCC_LOONGARCH_PROTOS_H */
diff --git a/gcc/config/loongarch/loongarch.cc 
b/gcc/config/loongarch/loongarch.cc
index d3896d72bc2..afee09c3b61 100644
--- a/gcc/config/loongarch/loongarch.cc
+++ b/gcc/config/loongarch/loongarch.cc
@@ -7547,6 +7547,69 @@ loongarch_option_override_internal (struct gcc_options 
*opts,
 
   /* Function to allocate machine-dependent function status.  */
   init_machine_status = &loongarch_init_machine_status;
+
+  /* -mrecip options.  */
+  static struct
+    {
+      const char *string;          /* option name.  */
+      unsigned int mask;           /* mask bits to set.  */
+    }
+  const recip_options[] = {
+       { "all",       RECIP_MASK_ALL },
+       { "none",      RECIP_MASK_NONE },
+       { "div",       RECIP_MASK_DIV },
+       { "sqrt",      RECIP_MASK_SQRT },
+       { "rsqrt",     RECIP_MASK_RSQRT },
+       { "vec-div",   RECIP_MASK_VEC_DIV },
+       { "vec-sqrt",  RECIP_MASK_VEC_SQRT },
+       { "vec-rsqrt", RECIP_MASK_VEC_RSQRT },
+  };
+
+  if (loongarch_recip_name)
+    {
+      char *p = ASTRDUP (loongarch_recip_name);
+      char *q;
+      unsigned int mask, i;
+      bool invert;
+
+      while ((q = strtok (p, ",")) != NULL)
+       {
+         p = NULL;
+         if (*q == '!')
+           {
+             invert = true;
+             q++;
+           }
+         else
+           invert = false;
+
+         if (!strcmp (q, "default"))
+           mask = RECIP_MASK_ALL;
+         else
+           {
+             for (i = 0; i < ARRAY_SIZE (recip_options); i++)
+               if (!strcmp (q, recip_options[i].string))
+                 {
+                   mask = recip_options[i].mask;
+                   break;
+                 }
+
+             if (i == ARRAY_SIZE (recip_options))
+               {
+                 error ("unknown option for %<-mrecip=%s%>", q);
+                 invert = false;
+                 mask = RECIP_MASK_NONE;
+               }
+           }
+
+         if (invert)
+           recip_mask &= ~mask;
+         else
+           recip_mask |= mask;
+       }
+    }
+  if (loongarch_recip)
+    recip_mask |= RECIP_MASK_ALL;
 }
 
 
@@ -11443,6 +11506,156 @@ loongarch_build_signbit_mask (machine_mode mode, bool 
vect, bool invert)
   return force_reg (vec_mode, v);
 }
 
+/* Use rsqrte instruction and Newton-Rhapson to compute the approximation of
+   a single precision floating point [reciprocal] square root.  */
+
+void loongarch_emit_swrsqrtsf (rtx res, rtx a, machine_mode mode, bool recip)
+{
+  rtx x0, e0, e1, e2, mhalf, monehalf;
+  REAL_VALUE_TYPE r;
+  int unspec;
+
+  x0 = gen_reg_rtx (mode);
+  e0 = gen_reg_rtx (mode);
+  e1 = gen_reg_rtx (mode);
+  e2 = gen_reg_rtx (mode);
+
+  real_arithmetic (&r, ABS_EXPR, &dconsthalf, NULL);
+  mhalf = const_double_from_real_value (r, SFmode);
+
+  real_arithmetic (&r, PLUS_EXPR, &dconsthalf, &dconst1);
+  monehalf = const_double_from_real_value (r, SFmode);
+  unspec = UNSPEC_RSQRTE;
+
+  if (VECTOR_MODE_P (mode))
+    {
+      mhalf = loongarch_build_const_vector (mode, true, mhalf);
+      monehalf = loongarch_build_const_vector (mode, true, monehalf);
+      unspec = GET_MODE_SIZE (mode) == 32 ? UNSPEC_LASX_XVFRSQRTE
+                                         : UNSPEC_LSX_VFRSQRTE;
+    }
+
+  /* rsqrt(a) =  rsqrte(a) * (1.5 - 0.5 * a * rsqrte(a) * rsqrte(a))
+     sqrt(a)  =  a * rsqrte(a) * (1.5 - 0.5 * a * rsqrte(a) * rsqrte(a))  */
+
+  a = force_reg (mode, a);
+
+  /* x0 = rsqrt(a) estimate.  */
+  emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
+                                             unspec)));
+
+  /* If (a == 0.0) Filter out infinity to prevent NaN for sqrt(0.0).  */
+  if (!recip)
+    {
+      rtx zero = force_reg (mode, CONST0_RTX (mode));
+
+      if (VECTOR_MODE_P (mode))
+       {
+         machine_mode imode = related_int_vector_mode (mode).require ();
+         rtx mask = gen_reg_rtx (imode);
+         emit_insn (gen_rtx_SET (mask, gen_rtx_NE (imode, a, zero)));
+         emit_insn (gen_rtx_SET (x0, gen_rtx_AND (mode, x0,
+                                                  gen_lowpart (mode, mask))));
+       }
+      else
+       {
+         rtx target = emit_conditional_move (x0, { GT, a, zero, mode },
+                                             x0, zero, mode, 0);
+         if (target != x0)
+           emit_move_insn (x0, target);
+       }
+    }
+
+  /* e0 = x0 * a  */
+  emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, a)));
+  /* e1 = e0 * x0  */
+  emit_insn (gen_rtx_SET (e1, gen_rtx_MULT (mode, e0, x0)));
+
+  /* e2 = 1.5 - e1 * 0.5  */
+  mhalf = force_reg (mode, mhalf);
+  monehalf = force_reg (mode, monehalf);
+  emit_insn (gen_rtx_SET (e2, gen_rtx_FMA (mode,
+                                          gen_rtx_NEG (mode, e1),
+                                                       mhalf, monehalf)));
+
+  if (recip)
+    /* res = e2 * x0  */
+    emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, x0, e2)));
+  else
+    /* res = e2 * e0  */
+    emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, e2, e0)));
+}
+
+/* Use recipe instruction and Newton-Rhapson to compute the approximation of
+   a single precision floating point divide.  */
+
+void loongarch_emit_swdivsf (rtx res, rtx a, rtx b, machine_mode mode)
+{
+  rtx x0, e0, mtwo;
+  REAL_VALUE_TYPE r;
+  x0 = gen_reg_rtx (mode);
+  e0 = gen_reg_rtx (mode);
+  int unspec = UNSPEC_RECIPE;
+
+  real_arithmetic (&r, ABS_EXPR, &dconst2, NULL);
+  mtwo = const_double_from_real_value (r, SFmode);
+
+  if (VECTOR_MODE_P (mode))
+    {
+      mtwo = loongarch_build_const_vector (mode, true, mtwo);
+      unspec = GET_MODE_SIZE (mode) == 32 ? UNSPEC_LASX_XVFRECIPE
+                                         : UNSPEC_LSX_VFRECIPE;
+    }
+
+  mtwo = force_reg (mode, mtwo);
+
+  /* a / b = a * recipe(b) * (2.0 - b * recipe(b))  */
+
+  /* x0 = 1./b estimate.  */
+  emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
+                                             unspec)));
+  /* 2.0 - b * x0  */
+  emit_insn (gen_rtx_SET (e0, gen_rtx_FMA (mode,
+                                          gen_rtx_NEG (mode, b), x0, mtwo)));
+
+  /* x0 = a * x0  */
+  if (a != CONST1_RTX (mode))
+    emit_insn (gen_rtx_SET (x0, gen_rtx_MULT (mode, a, x0)));
+
+  /* res = e0 * x0  */
+  emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, e0, x0)));
+}
+
+/* Return true if it is safe to use the rsqrt optabs to optimize
+   1.0/sqrt.  */
+
+static bool
+use_rsqrt_p (machine_mode mode)
+{
+  if (TARGET_RECIP_RSQRT && GET_MODE_INNER (mode) == SFmode)
+    return (flag_finite_math_only
+           && !flag_trapping_math
+           && flag_unsafe_math_optimizations);
+  else
+    return true;
+}
+
+/* Implement the TARGET_OPTAB_SUPPORTED_P hook.  */
+
+static bool
+loongarch_optab_supported_p (int op, machine_mode mode1, machine_mode,
+                            optimization_type opt_type)
+{
+  switch (op)
+    {
+    case rsqrt_optab:
+      return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
+
+    default:
+      return true;
+    }
+}
+
 static bool
 loongarch_builtin_support_vector_misalignment (machine_mode mode,
                                               const_tree type,
@@ -11610,6 +11823,9 @@ loongarch_asm_code_end (void)
 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES \
   loongarch_autovectorize_vector_modes
 
+#undef TARGET_OPTAB_SUPPORTED_P
+#define TARGET_OPTAB_SUPPORTED_P loongarch_optab_supported_p
+
 #undef TARGET_INIT_BUILTINS
 #define TARGET_INIT_BUILTINS loongarch_init_builtins
 #undef TARGET_BUILTIN_DECL
diff --git a/gcc/config/loongarch/loongarch.h b/gcc/config/loongarch/loongarch.h
index 115222e70fd..9910084a6a5 100644
--- a/gcc/config/loongarch/loongarch.h
+++ b/gcc/config/loongarch/loongarch.h
@@ -700,6 +700,24 @@ enum reg_class
    && (GET_MODE_CLASS (MODE) == MODE_VECTOR_INT                \
        || GET_MODE_CLASS (MODE) == MODE_VECTOR_FLOAT))
 
+#define RECIP_MASK_NONE         0x00
+#define RECIP_MASK_DIV          0x01
+#define RECIP_MASK_SQRT         0x02
+#define RECIP_MASK_RSQRT        0x04
+#define RECIP_MASK_VEC_DIV      0x08
+#define RECIP_MASK_VEC_SQRT     0x10
+#define RECIP_MASK_VEC_RSQRT    0x20
+#define RECIP_MASK_ALL (RECIP_MASK_DIV | RECIP_MASK_SQRT \
+                       | RECIP_MASK_RSQRT | RECIP_MASK_VEC_SQRT \
+                       | RECIP_MASK_VEC_DIV | RECIP_MASK_VEC_RSQRT)
+
+#define TARGET_RECIP_DIV        ((recip_mask & RECIP_MASK_DIV) != 0 || 
TARGET_uARCH_LA664)
+#define TARGET_RECIP_SQRT       ((recip_mask & RECIP_MASK_SQRT) != 0 || 
TARGET_uARCH_LA664)
+#define TARGET_RECIP_RSQRT      ((recip_mask & RECIP_MASK_RSQRT) != 0 || 
TARGET_uARCH_LA664)
+#define TARGET_RECIP_VEC_DIV    ((recip_mask & RECIP_MASK_VEC_DIV) != 0 || 
TARGET_uARCH_LA664)
+#define TARGET_RECIP_VEC_SQRT   ((recip_mask & RECIP_MASK_VEC_SQRT) != 0 || 
TARGET_uARCH_LA664)
+#define TARGET_RECIP_VEC_RSQRT  ((recip_mask & RECIP_MASK_VEC_RSQRT) != 0 || 
TARGET_uARCH_LA664)
+
 /* 1 if N is a possible register number for function argument passing.
    We have no FP argument registers when soft-float.  */
 
diff --git a/gcc/config/loongarch/loongarch.md 
b/gcc/config/loongarch/loongarch.md
index 0b6910d84ab..ce4d2d9ad06 100644
--- a/gcc/config/loongarch/loongarch.md
+++ b/gcc/config/loongarch/loongarch.md
@@ -896,9 +896,21 @@ (define_peephole
 ;; Float division and modulus.
 (define_expand "div<mode>3"
   [(set (match_operand:ANYF 0 "register_operand")
-       (div:ANYF (match_operand:ANYF 1 "reg_or_1_operand")
-                 (match_operand:ANYF 2 "register_operand")))]
-  "")
+    (div:ANYF (match_operand:ANYF 1 "reg_or_1_operand")
+             (match_operand:ANYF 2 "register_operand")))]
+  ""
+{
+  if (<MODE>mode == SFmode
+    && TARGET_RECIP_DIV
+    && optimize_insn_for_speed_p ()
+    && flag_finite_math_only && !flag_trapping_math
+    && flag_unsafe_math_optimizations)
+  {
+    loongarch_emit_swdivsf (operands[0], operands[1],
+       operands[2], SFmode);
+    DONE;
+  }
+})
 
 (define_insn "*div<mode>3"
   [(set (match_operand:ANYF 0 "register_operand" "=f")
@@ -1129,7 +1141,23 @@ (define_insn "*fnma<mode>4"
 ;;
 ;;  ....................
 
-(define_insn "sqrt<mode>2"
+(define_expand "sqrt<mode>2"
+  [(set (match_operand:ANYF 0 "register_operand")
+    (sqrt:ANYF (match_operand:ANYF 1 "register_operand")))]
+  ""
+ {
+  if (<MODE>mode == SFmode
+      && TARGET_RECIP_SQRT
+      && flag_unsafe_math_optimizations
+      && !optimize_insn_for_size_p ()
+      && flag_finite_math_only && !flag_trapping_math)
+    {
+      loongarch_emit_swrsqrtsf (operands[0], operands[1], SFmode, 0);
+      DONE;
+    }
+ })
+
+(define_insn "*sqrt<mode>2"
   [(set (match_operand:ANYF 0 "register_operand" "=f")
        (sqrt:ANYF (match_operand:ANYF 1 "register_operand" "f")))]
   ""
@@ -1138,6 +1166,19 @@ (define_insn "sqrt<mode>2"
    (set_attr "mode" "<UNITMODE>")
    (set_attr "insn_count" "1")])
 
+(define_expand "rsqrt<mode>2"
+  [(set (match_operand:ANYF 0 "register_operand")
+    (unspec:ANYF [(match_operand:ANYF 1 "register_operand")]
+          UNSPEC_RSQRT))]
+  "TARGET_HARD_FLOAT"
+{
+   if (<MODE>mode == SFmode && TARGET_RECIP_RSQRT)
+     {
+       loongarch_emit_swrsqrtsf (operands[0], operands[1], SFmode, 1);
+       DONE;
+     }
+})
+
 (define_insn "*rsqrt<mode>2"
   [(set (match_operand:ANYF 0 "register_operand" "=f")
     (unspec:ANYF [(match_operand:ANYF 1 "register_operand" "f")]
diff --git a/gcc/config/loongarch/loongarch.opt 
b/gcc/config/loongarch/loongarch.opt
index 4d36e3ec4de..3c4fbda18ee 100644
--- a/gcc/config/loongarch/loongarch.opt
+++ b/gcc/config/loongarch/loongarch.opt
@@ -31,6 +31,9 @@ config/loongarch/loongarch-opts.h
 HeaderInclude
 config/loongarch/loongarch-str.h
 
+TargetVariable
+unsigned int recip_mask = 0
+
 ; ISA related options
 ;; Base ISA
 Enum
@@ -205,6 +208,14 @@ mexplicit-relocs
 Target Var(la_opt_explicit_relocs_backward) Init(M_OPT_UNSET)
 Use %reloc() assembly operators (for backward compatibility).
 
+mrecip
+Target RejectNegative Var(loongarch_recip)
+Generate approximate reciprocal divide and square root for better throughput.
+
+mrecip=
+Target RejectNegative Joined Var(loongarch_recip_name)
+Control generation of reciprocal estimates.
+
 ; The code model option names for -mcmodel.
 Enum
 Name(cmodel) Type(int)
diff --git a/gcc/config/loongarch/lsx.md b/gcc/config/loongarch/lsx.md
index 20946326e37..dc78837ecf5 100644
--- a/gcc/config/loongarch/lsx.md
+++ b/gcc/config/loongarch/lsx.md
@@ -1153,7 +1153,25 @@ (define_insn "mul<mode>3"
   [(set_attr "type" "simd_fmul")
    (set_attr "mode" "<MODE>")])
 
-(define_insn "div<mode>3"
+(define_expand "div<mode>3"
+  [(set (match_operand:FLSX 0 "register_operand")
+    (div:FLSX (match_operand:FLSX 1 "reg_or_vecotr_1_operand")
+             (match_operand:FLSX 2 "register_operand")))]
+  "ISA_HAS_LSX"
+{
+  if (<MODE>mode == V4SFmode
+    && TARGET_RECIP_VEC_DIV
+    && optimize_insn_for_speed_p ()
+    && flag_finite_math_only && !flag_trapping_math
+    && flag_unsafe_math_optimizations)
+  {
+    loongarch_emit_swdivsf (operands[0], operands[1],
+       operands[2], V4SFmode);
+    DONE;
+  }
+})
+
+(define_insn "*div<mode>3"
   [(set (match_operand:FLSX 0 "register_operand" "=f")
        (div:FLSX (match_operand:FLSX 1 "register_operand" "f")
                  (match_operand:FLSX 2 "register_operand" "f")))]
@@ -1182,7 +1200,23 @@ (define_insn "fnma<mode>4"
   [(set_attr "type" "simd_fmadd")
    (set_attr "mode" "<MODE>")])
 
-(define_insn "sqrt<mode>2"
+(define_expand "sqrt<mode>2"
+  [(set (match_operand:FLSX 0 "register_operand")
+    (sqrt:FLSX (match_operand:FLSX 1 "register_operand")))]
+  "ISA_HAS_LSX"
+{
+  if (<MODE>mode == V4SFmode
+      && TARGET_RECIP_VEC_SQRT
+      && flag_unsafe_math_optimizations
+      && optimize_insn_for_speed_p ()
+      && flag_finite_math_only && !flag_trapping_math)
+    {
+      loongarch_emit_swrsqrtsf (operands[0], operands[1], V4SFmode, 0);
+      DONE;
+    }
+})
+
+(define_insn "*sqrt<mode>2"
   [(set (match_operand:FLSX 0 "register_operand" "=f")
        (sqrt:FLSX (match_operand:FLSX 1 "register_operand" "f")))]
   "ISA_HAS_LSX"
@@ -1638,7 +1672,20 @@ (define_insn "lsx_vfrint_<flsxfmt>"
   [(set_attr "type" "simd_fcvt")
    (set_attr "mode" "<MODE>")])
 
-(define_insn "rsqrt<mode>2"
+(define_expand "rsqrt<mode>2"
+  [(set (match_operand:FLSX 0 "register_operand" "=f")
+    (unspec:FLSX [(match_operand:FLSX 1 "register_operand" "f")]
+            UNSPEC_LSX_VFRSQRT))]
+ "ISA_HAS_LSX"
+{
+ if (<MODE>mode == V4SFmode && TARGET_RECIP_VEC_RSQRT)
+   {
+     loongarch_emit_swrsqrtsf (operands[0], operands[1], V4SFmode, 1);
+     DONE;
+   }
+})
+
+(define_insn "*rsqrt<mode>2"
   [(set (match_operand:FLSX 0 "register_operand" "=f")
     (unspec:FLSX [(match_operand:FLSX 1 "register_operand" "f")]
                 UNSPEC_LSX_VFRSQRT))]
diff --git a/gcc/config/loongarch/predicates.md 
b/gcc/config/loongarch/predicates.md
index f7796da10b2..9e9ce58cb53 100644
--- a/gcc/config/loongarch/predicates.md
+++ b/gcc/config/loongarch/predicates.md
@@ -235,6 +235,10 @@ (define_predicate "reg_or_1_operand"
   (ior (match_operand 0 "const_1_operand")
        (match_operand 0 "register_operand")))
 
+(define_predicate "reg_or_vecotr_1_operand"
+  (ior (match_operand 0 "const_vector_1_operand")
+       (match_operand 0 "register_operand")))
+
 ;; These are used in vec_merge, hence accept bitmask as const_int.
 (define_predicate "const_exp_2_operand"
   (and (match_code "const_int")
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 2e6bac37f3c..fd2a3e2a848 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -1209,6 +1209,7 @@ Objective-C and Objective-C++ Dialects}.
 -msoft-float  -mhard-float  -mdouble-float -munordered-float
 -mcmov  -mror  -mrori  -msext  -msfimm  -mshftimm
 -mcmodel=@var{code-model}}
+-mrecip  -mrecip=@var{opt}
 
 @emph{PDP-11 Options}
 @gccoptlist{-mfpu  -msoft-float  -mac0  -mno-ac0  -m40  -m45  -m10
@@ -26523,6 +26524,57 @@ detecting corresponding assembler support:
 This option is mostly useful for debugging, or interoperation with
 assemblers different from the build-time one.
 
+@opindex mrecip
+@item -mrecip
+This option enables use of the reciprocal estimate and reciprocal square
+root estimate instructions with additional Newton-Raphson steps to increase
+precision instead of doing a divide or square root and divide for
+floating-point arguments.
+These instructions are generated only when @option{-funsafe-math-optimizations}
+is enabled together with @option{-ffinite-math-only} and
+@option{-fno-trapping-math}.
+Note that while the throughput of the sequence is higher than the throughput of
+the non-reciprocal instruction, the precision of the sequence can be decreased
+by up to 2 ulp (i.e. the inverse of 1.0 equals 0.99999994).
+
+@opindex mrecip=opt
+@item -mrecip=@var{opt}
+This option controls which reciprocal estimate instructions
+may be used.  @var{opt} is a comma-separated list of options, which may
+be preceded by a @samp{!} to invert the option:
+
+@table @samp
+@item all
+Enable all estimate instructions.
+
+@item default
+Enable the default instructions, equivalent to @option{-mrecip}.
+
+@item none
+Disable all estimate instructions, equivalent to @option{-mno-recip}.
+
+@item div
+Enable the approximation for scalar division.
+
+@item vec-div
+Enable the approximation for vectorized division.
+
+@item sqrt
+Enable the approximation for scalar square root.
+
+@item vec-sqrt
+Enable the approximation for vectorized square root.
+
+@item rsqrt
+Enable the approximation for scalar reciprocal square root.
+
+@item vec-rsqrt
+Enable the approximation for vectorized reciprocal square root.
+@end table
+
+So, for example, @option{-mrecip=all,!sqrt} enables
+all of the reciprocal approximations, except for scalar square root.
+
 @item loongarch-vect-unroll-limit
 The vectorizer will use available tuning information to determine whether it
 would be beneficial to unroll the main vectorized loop and by how much.  This
diff --git a/gcc/testsuite/gcc.target/loongarch/recip-divf.c 
b/gcc/testsuite/gcc.target/loongarch/recip-divf.c
new file mode 100644
index 00000000000..82b3224a250
--- /dev/null
+++ b/gcc/testsuite/gcc.target/loongarch/recip-divf.c
@@ -0,0 +1,9 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ffast-math -mrecip -march=la664" } */
+/* { dg-final { scan-assembler "frecipe.s" } } */
+
+float
+foo(float a, float b)
+{
+  return a / b;
+}
diff --git a/gcc/testsuite/gcc.target/loongarch/recip-sqrtf.c 
b/gcc/testsuite/gcc.target/loongarch/recip-sqrtf.c
new file mode 100644
index 00000000000..23ceb4cd261
--- /dev/null
+++ b/gcc/testsuite/gcc.target/loongarch/recip-sqrtf.c
@@ -0,0 +1,23 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ffast-math -mrecip -march=la664" } */
+/* { dg-final { scan-assembler-times "frsqrte.s" 3 } } */
+
+extern float sqrtf (float);
+
+float
+foo1 (float a, float b)
+{
+  return a/sqrtf(b);
+}
+
+float
+foo2 (float a, float b)
+{
+  return sqrtf(a/b);
+}
+
+float
+foo3 (float a)
+{
+  return sqrtf(a);
+}
diff --git a/gcc/testsuite/gcc.target/loongarch/vector/lasx/lasx-recip-divf.c 
b/gcc/testsuite/gcc.target/loongarch/vector/lasx/lasx-recip-divf.c
new file mode 100644
index 00000000000..6ca72a1ce81
--- /dev/null
+++ b/gcc/testsuite/gcc.target/loongarch/vector/lasx/lasx-recip-divf.c
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ffast-math -mrecip -mlasx -march=la664" } */
+/* { dg-final { scan-assembler "xvfrecipe.s" } } */
+
+float a[8],b[8],c[8];
+
+void 
+foo ()
+{
+  for (int i = 0; i < 8; i++)
+    c[i] = a[i] / b[i];
+}
diff --git a/gcc/testsuite/gcc.target/loongarch/vector/lasx/lasx-recip-sqrtf.c 
b/gcc/testsuite/gcc.target/loongarch/vector/lasx/lasx-recip-sqrtf.c
new file mode 100644
index 00000000000..5f1c4e3d164
--- /dev/null
+++ b/gcc/testsuite/gcc.target/loongarch/vector/lasx/lasx-recip-sqrtf.c
@@ -0,0 +1,28 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ffast-math -mrecip -mlasx -march=la664" } */
+/* { dg-final { scan-assembler-times "xvfrsqrte.s" 3 } } */
+
+float a[8], b[8], c[8];
+
+extern float sqrtf (float);
+
+void
+foo1 (void)
+{
+  for (int i = 0; i < 8; i++)
+    c[i] = a[i] / sqrtf (b[i]);
+}
+
+void
+foo2 (void)
+{
+  for (int i = 0; i < 8; i++)
+    c[i] = sqrtf (a[i] / b[i]);
+}
+
+void
+foo3 (void)
+{
+  for (int i = 0; i < 8; i++)
+    c[i] = sqrtf (a[i]);
+}
diff --git a/gcc/testsuite/gcc.target/loongarch/vector/lsx/lsx-recip-divf.c 
b/gcc/testsuite/gcc.target/loongarch/vector/lsx/lsx-recip-divf.c
new file mode 100644
index 00000000000..015dafb50f2
--- /dev/null
+++ b/gcc/testsuite/gcc.target/loongarch/vector/lsx/lsx-recip-divf.c
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ffast-math -mrecip -mlsx -march=la664" } */
+/* { dg-final { scan-assembler "vfrecipe.s" } } */
+
+float a[4],b[4],c[4];
+
+void
+foo ()
+{
+  for (int i = 0; i < 4; i++)
+    c[i] = a[i] / b[i];
+}
diff --git a/gcc/testsuite/gcc.target/loongarch/vector/lsx/lsx-recip-sqrtf.c 
b/gcc/testsuite/gcc.target/loongarch/vector/lsx/lsx-recip-sqrtf.c
new file mode 100644
index 00000000000..5a1a14e291c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/loongarch/vector/lsx/lsx-recip-sqrtf.c
@@ -0,0 +1,28 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ffast-math -mrecip -mlsx -march=la664" } */
+/* { dg-final { scan-assembler-times "vfrsqrte.s" 3 } } */
+
+float a[4], b[4], c[4];
+
+extern float sqrtf (float);
+
+void
+foo1 (void)
+{
+  for (int i = 0; i < 4; i++)
+    c[i] = a[i] / sqrtf (b[i]);
+}
+
+void
+foo2 (void)
+{
+  for (int i = 0; i < 4; i++)
+    c[i] = sqrtf (a[i] / b[i]);
+}
+
+void
+foo3 (void)
+{
+  for (int i = 0; i < 4; i++)
+    c[i] = sqrtf (a[i]);
+}
-- 
2.20.1

[PATCH 4/5] LoongArch: New options -mrecip and -mrecip= with ffast-math.

Reply via email to