Hello,

just posting the current version of this patch, in case people have comments.

Some changes: the inline asm is now introduced during expansion, and the feature is controlled by a new flag (it should eventually be controlled by the pragma, but that's starting to be too many pieces to implement at the same time, and I didn't want to cause a regression for people using -frounding-math in the cases where it actually works). I also added an extra parameter, currently always 0, to specify some properties of the operation: the first one I am thinking of is "don't care about exceptions", since I only care about rounding, but that will require even more flags/pragmas to specify the variants we want...

For the inline asm, I hesitated between building a temporary GIMPLE_ASM just so I could pass it to the existing expansion code, and "inlining" a simplified version of that expansion. This version always goes through the stack, which matches well with the constraint "=m". One would have to modify the code to allow "=x". Using "=mx", the compiler does simplify things so we actually go through registers (it occasionally leaves a dead store to the stack here or there, but not that many, and it looks like an existing missed optimization), which makes me think it is not that important to write specific code to handle "=x".

Some possible future work:
- target hook to specify a constraint different from "=m"
- target hook to expand the functions and/or the opaque pass-through
- more operations (maybe comparisons, conversions, etc)
- lowering generic vector operations, so I can enable them in the front-end
- parsing the pragma
- optimizations (at least exact constant folding)
- constexpr? Disable in some contexts where a dynamic rounding mode makes less sense?
- C front-end
- Use caller's environment for always_inline callee? We would have to mark the call so we remember what the environment was, and it would be too late for some foldings, but we could still translate the operations that remain, which should be sufficient for the x86 *intrin.h files. To be safe we would have to assume fenv_access on for always_inline functions and only lower them to regular operations when we see the caller, but that might be too much.

--
Marc Glisse
Index: gcc/c-family/c.opt
===================================================================
--- gcc/c-family/c.opt	(revision 274007)
+++ gcc/c-family/c.opt	(working copy)
@@ -1495,20 +1495,24 @@ C ObjC C++ ObjC++ Joined RejectNegative
 -fexec-charset=<cset>	Convert all strings and character constants to character set <cset>.
 
 fextended-identifiers
 C ObjC C++ ObjC++
 Permit universal character names (\\u and \\U) in identifiers.
 
 finput-charset=
 C ObjC C++ ObjC++ Joined RejectNegative
 -finput-charset=<cset>	Specify the default character set for source files.
 
+ffenv-access
+C++ ObjC++ Var(flag_fenv_access)
+Assume #pragma STDC FENV_ACCESS ON at the beginning of the translation unit (experimental and incomplete support).
+
 fextern-tls-init
 C++ ObjC++ Var(flag_extern_tls_init) Init(-1)
 Support dynamic initialization of thread-local variables in a different translation unit.
 
 fexternal-templates
 C++ ObjC++ Deprecated
 
 ffor-scope
 C++ ObjC++ Deprecated
 
Index: gcc/cp/call.c
===================================================================
--- gcc/cp/call.c	(revision 274007)
+++ gcc/cp/call.c	(working copy)
@@ -9137,21 +9137,33 @@ maybe_warn_class_memaccess (location_t l
    high-level operations.  */
 
 tree
 build_cxx_call (tree fn, int nargs, tree *argarray,
 		tsubst_flags_t complain)
 {
   tree fndecl;
 
   /* Remember roughly where this call is.  */
   location_t loc = cp_expr_loc_or_loc (fn, input_location);
-  fn = build_call_a (fn, nargs, argarray);
+
+  if (flag_fenv_access && TREE_CODE (fn) == ADDR_EXPR
+      && (fndecl_built_in_p (TREE_OPERAND (fn, 0), BUILT_IN_SQRT)
+	  || fndecl_built_in_p (TREE_OPERAND (fn, 0), BUILT_IN_SQRTF)
+	  || fndecl_built_in_p (TREE_OPERAND (fn, 0), BUILT_IN_SQRTL)))
+    {
+      fn = build_call_expr_internal_loc (loc, IFN_FENV_SQRT,
+					 TREE_TYPE (argarray[0]), 2,
+					 argarray[0], integer_zero_node);
+      TREE_SIDE_EFFECTS (fn) = 1;
+    }
+  else
+    fn = build_call_a (fn, nargs, argarray);
   SET_EXPR_LOCATION (fn, loc);
 
   fndecl = get_callee_fndecl (fn);
 
   /* Check that arguments to builtin functions match the expectations.  */
   if (fndecl
       && !processing_template_decl
       && fndecl_built_in_p (fndecl, BUILT_IN_NORMAL))
     {
       int i;
Index: gcc/cp/typeck.c
===================================================================
--- gcc/cp/typeck.c	(revision 274007)
+++ gcc/cp/typeck.c	(working copy)
@@ -5543,20 +5543,52 @@ cp_build_binary_op (const op_location_t
 	  if (TREE_TYPE (cop0) != orig_type)
 	    cop0 = cp_convert (orig_type, op0, complain);
 	  if (TREE_TYPE (cop1) != orig_type)
 	    cop1 = cp_convert (orig_type, op1, complain);
 	  instrument_expr = ubsan_instrument_division (location, cop0, cop1);
 	}
       else if (doing_shift && sanitize_flags_p (SANITIZE_SHIFT))
 	instrument_expr = ubsan_instrument_shift (location, code, op0, op1);
     }
 
+  // FIXME: vectors (and complex?) as well
+  if (flag_fenv_access && SCALAR_FLOAT_TYPE_P (build_type))
+    {
+      bool do_fenv_subst = true;
+      internal_fn ifn;
+      switch (resultcode)
+	{
+	case PLUS_EXPR:
+	  ifn = IFN_FENV_PLUS;
+	  break;
+	case MINUS_EXPR:
+	  ifn = IFN_FENV_MINUS;
+	  break;
+	case MULT_EXPR:
+	  ifn = IFN_FENV_MULT;
+	  break;
+	case RDIV_EXPR:
+	  ifn = IFN_FENV_RDIV;
+	  break;
+	default:
+	  do_fenv_subst = false;
+	}
+      if (do_fenv_subst)
+	{
+	  result = build_call_expr_internal_loc (location, ifn, build_type,
+						 3, op0, op1,
+						 integer_zero_node);
+	  TREE_SIDE_EFFECTS (result) = 1;
+	  return result;
+	}
+    }
+
   result = build2_loc (location, resultcode, build_type, op0, op1);
   if (final_type != 0)
     result = cp_convert (final_type, result, complain);
 
   if (instrument_expr != NULL)
     result = build2 (COMPOUND_EXPR, TREE_TYPE (result),
 		     instrument_expr, result);
 
   if (!processing_template_decl)
     {
@@ -6501,20 +6533,32 @@ cp_build_unary_op (enum tree_code code,
       return cp_build_addr_expr (arg, complain);
 
     default:
       break;
     }
 
   if (!errstring)
     {
       if (argtype == 0)
 	argtype = TREE_TYPE (arg);
+      // FIXME: vectors (and complex?) as well
+      if (flag_fenv_access
+	  && SCALAR_FLOAT_TYPE_P (argtype)
+	  && code == NEGATE_EXPR)
+	{
+	  tree result
+	    = build_call_expr_internal_loc (location, IFN_FENV_NEGATE,
+					    argtype, 2, arg,
+					    integer_zero_node);
+	  TREE_SIDE_EFFECTS (result) = 1;
+	  return result;
+	}
       return build1 (code, argtype, arg);
     }
 
   if (complain & tf_error)
     error ("%s", errstring);
   return error_mark_node;
 }
 
 /* Hook for the c-common bits that build a unary op.  */
 tree
Index: gcc/internal-fn.c
===================================================================
--- gcc/internal-fn.c	(revision 274007)
+++ gcc/internal-fn.c	(working copy)
@@ -23,20 +23,21 @@ along with GCC; see the file COPYING3.
 #include "backend.h"
 #include "target.h"
 #include "rtl.h"
 #include "tree.h"
 #include "gimple.h"
 #include "predict.h"
 #include "stringpool.h"
 #include "tree-vrp.h"
 #include "tree-ssanames.h"
 #include "expmed.h"
+#include "explow.h"
 #include "memmodel.h"
 #include "optabs.h"
 #include "emit-rtl.h"
 #include "diagnostic-core.h"
 #include "fold-const.h"
 #include "internal-fn.h"
 #include "stor-layout.h"
 #include "dojump.h"
 #include "expr.h"
 #include "stringpool.h"
@@ -2869,20 +2870,214 @@ expand_DIVMOD (internal_fn, gcall *call_
 }
 
 /* Expand a NOP.  */
 
 static void
 expand_NOP (internal_fn, gcall *)
 {
   /* Nothing.  But it shouldn't really prevail.  */
 }
 
+/* Checks if OP is already protected (in particular the result of some FENV
+   arithmetic) and thus can skip the inline asm.  */
+static bool
+already_protected (tree op)
+{
+  // TODO: function parameters/returns, volatile reads, etc may also be ok?
+  gimple *def_stmt;
+  if (TREE_CODE (op) != SSA_NAME
+      || !(def_stmt = SSA_NAME_DEF_STMT (op))
+      || gimple_code (def_stmt) != GIMPLE_CALL
+      || !gimple_call_internal_p (def_stmt))
+    return false;
+  switch (gimple_call_internal_fn (def_stmt))
+    {
+    case IFN_FENV_PLUS:
+    case IFN_FENV_MINUS:
+    case IFN_FENV_MULT:
+    case IFN_FENV_RDIV:
+    case IFN_FENV_NEGATE:
+    case IFN_FENV_SQRT:
+      return true;
+    default:
+      return false;
+    }
+}
+
+/* Emit asm volatile with inout memory parameter X.  */
+static void
+emit_asm_protection (rtx x, machine_mode mode, location_t loc)
+{
+  const char* constraint = "=m";
+  rtvec argvec = rtvec_alloc (1);
+  rtvec constraintvec = rtvec_alloc (1);
+  rtvec labelvec = rtvec_alloc (0);
+  rtx body = gen_rtx_ASM_OPERANDS (mode, "", "", 0, argvec, constraintvec,
+				   labelvec, loc);
+  MEM_VOLATILE_P (body) = true;
+  ASM_OPERANDS_INPUT (body, 0) = x;
+  ASM_OPERANDS_INPUT_CONSTRAINT_EXP (body, 0)
+    = gen_rtx_ASM_INPUT_loc (mode, "0", loc);
+  ASM_OPERANDS_OUTPUT_CONSTRAINT (body) = constraint;
+  emit_insn (gen_rtx_SET (x, body));
+}
+
+/* Expand FENV-protected arithmetic.  */
+static void
+expand_FENV_binop (gcall *stmt, tree_code code)
+{
+  tree lhs = gimple_call_lhs (stmt);
+  tree op0 = gimple_call_arg (stmt, 0);
+  tree op1 = gimple_call_arg (stmt, 1);
+  tree ftype = TREE_TYPE (op0);
+  machine_mode fmode = TYPE_MODE (ftype);
+  location_t loc = gimple_location (stmt);
+  rtx x0, x1;
+  bool skip0 = false;
+
+  if (already_protected (op0))
+    {
+      skip0 = true;
+      x0 = expand_normal (op0);
+    }
+  else
+    {
+      x0 = validize_mem (assign_temp (ftype, true, true));
+      expand_assignment (make_tree (ftype, x0), op0, false);
+      emit_asm_protection (x0, fmode, loc);
+    }
+
+  /* Careful not to end up with something like X - X, which could get
+     simplified.  */
+  if (!skip0 && already_protected (op1))
+    {
+      x1 = expand_normal (op1);
+    }
+  else
+    {
+      x1 = validize_mem (assign_temp (ftype, true, true));
+      expand_assignment (make_tree (ftype, x1), op1, false);
+      emit_asm_protection (x1, fmode, loc);
+    }
+
+  rtx x2 = validize_mem (assign_temp (ftype, true, true));
+  // Using some intermediate registers for x0 and x1 could help x86 avoid
+  // reg+mem when it isn't necessary. It could also save a bit of stack.
+  optab o = optab_for_tree_code (code, ftype, optab_default);
+  rtx y2 = expand_binop (fmode, o, x0, x1, x2, false, OPTAB_DIRECT);
+  if (x2 != y2)
+    emit_move_insn (x2, y2);
+  emit_asm_protection (x2, fmode, loc);
+  if (lhs)
+    expand_assignment (lhs, make_tree (ftype, x2), false);
+  crtl->has_asm_statement = 1;
+}
+
+static void
+expand_FENV_PLUS (internal_fn, gcall *stmt)
+{
+  expand_FENV_binop (stmt, PLUS_EXPR);
+}
+
+static void
+expand_FENV_MINUS (internal_fn, gcall *stmt)
+{
+  expand_FENV_binop (stmt, MINUS_EXPR);
+}
+
+static void
+expand_FENV_MULT (internal_fn, gcall *stmt)
+{
+  expand_FENV_binop (stmt, MULT_EXPR);
+}
+
+static void
+expand_FENV_RDIV (internal_fn, gcall *stmt)
+{
+  expand_FENV_binop (stmt, RDIV_EXPR);
+}
+
+static void
+expand_FENV_NEGATE (internal_fn, gcall *stmt)
+{
+  tree lhs = gimple_call_lhs (stmt);
+  tree op = gimple_call_arg (stmt, 0);
+  tree ftype = TREE_TYPE (op);
+  machine_mode fmode = TYPE_MODE (ftype);
+  location_t loc = gimple_location (stmt);
+  rtx x;
+
+  if (already_protected (op))
+    {
+      x = expand_normal (op);
+    }
+  else
+    {
+      x = validize_mem (assign_temp (ftype, true, true));
+      expand_assignment (make_tree (ftype, x), op, false);
+      emit_asm_protection (x, fmode, loc);
+    }
+
+  rtx x2 = validize_mem (assign_temp (ftype, true, true));
+  optab o = optab_for_tree_code (NEGATE_EXPR, ftype, optab_default);
+  rtx y2 = expand_unop (fmode, o, x, x2, false);
+  if (x2 != y2)
+    emit_move_insn (x2, y2);
+  emit_asm_protection (x2, fmode, loc);
+  if (lhs)
+    expand_assignment (lhs, make_tree (ftype, x2), false);
+  crtl->has_asm_statement = 1;
+}
+
+static void
+expand_FENV_SQRT (internal_fn, gcall *stmt)
+{
+  // We cannot delegate to expand_call_stmt of a dummy stmt with BUILT_IN_SQRT
+  // because it does not always generate a call.
+  tree lhs = gimple_call_lhs (stmt);
+  tree op = gimple_call_arg (stmt, 0);
+  tree ftype = TREE_TYPE (op);
+  machine_mode fmode = TYPE_MODE (ftype);
+  location_t loc = gimple_location (stmt);
+  built_in_function f;
+  if (fmode == TYPE_MODE (float_type_node))
+    f = BUILT_IN_SQRTF;
+  else if (fmode == TYPE_MODE (double_type_node))
+    f = BUILT_IN_SQRT;
+  else if (fmode == TYPE_MODE (long_double_type_node))
+    f = BUILT_IN_SQRTL;
+  else
+    gcc_unreachable();
+  tree sqrtfn = mathfn_built_in (ftype, f);
+  tree exp = build_vl_exp (CALL_EXPR, 1 + 3);
+  CALL_EXPR_FN (exp)
+    = build1_loc (loc, ADDR_EXPR,
+		  build_pointer_type (TREE_TYPE (sqrtfn)), sqrtfn);
+  TREE_TYPE (exp) = ftype;
+  CALL_EXPR_STATIC_CHAIN (exp) = gimple_call_chain (stmt); // ???
+  TREE_SIDE_EFFECTS (exp) = 1;
+  TREE_NOTHROW (exp) = 1;
+  CALL_EXPR_TAILCALL (exp) = gimple_call_tail_p (stmt);
+  CALL_FROM_THUNK_P (exp) = gimple_call_from_thunk_p (stmt);
+  CALL_EXPR_ARG (exp, 0) = op;
+  SET_EXPR_LOCATION (exp, loc);
+  // debug?
+
+  rtx res = validize_mem (assign_temp (ftype, true, true));
+  tree tmp = make_tree (ftype, res);
+  expand_assignment (tmp, exp, false);
+  // Still needed to avoid DCE when the result is unused.
+  emit_asm_protection (res, fmode, loc);
+  if (lhs)
+    expand_assignment (lhs, tmp, false);
+}
+
 /* Expand a call to FN using the operands in STMT.  FN has a single
    output operand and NARGS input operands.  */
 
 static void
 expand_direct_optab_fn (internal_fn fn, gcall *stmt, direct_optab optab,
 			unsigned int nargs)
 {
   expand_operand *ops = XALLOCAVEC (expand_operand, nargs + 1);
 
   tree_pair types = direct_internal_fn_types (fn, stmt);
Index: gcc/internal-fn.def
===================================================================
--- gcc/internal-fn.def	(revision 274007)
+++ gcc/internal-fn.def	(working copy)
@@ -345,16 +345,24 @@ DEF_INTERNAL_FN (FALLTHROUGH, ECF_LEAF |
 
 /* To implement __builtin_launder.  */
 DEF_INTERNAL_FN (LAUNDER, ECF_LEAF | ECF_NOTHROW | ECF_NOVOPS, NULL)
 
 /* Divmod function.  */
 DEF_INTERNAL_FN (DIVMOD, ECF_CONST | ECF_LEAF, NULL)
 
 /* A NOP function with arbitrary arguments and return value.  */
 DEF_INTERNAL_FN (NOP, ECF_CONST | ECF_LEAF | ECF_NOTHROW, NULL)
 
+/* float operations with rounding / exception flags.  */
+DEF_INTERNAL_FN (FENV_PLUS, ECF_LEAF | ECF_NOTHROW, NULL)
+DEF_INTERNAL_FN (FENV_MINUS, ECF_LEAF | ECF_NOTHROW, NULL)
+DEF_INTERNAL_FN (FENV_MULT, ECF_LEAF | ECF_NOTHROW, NULL)
+DEF_INTERNAL_FN (FENV_RDIV, ECF_LEAF | ECF_NOTHROW, NULL)
+DEF_INTERNAL_FN (FENV_NEGATE, ECF_LEAF | ECF_NOTHROW, NULL)
+DEF_INTERNAL_FN (FENV_SQRT, ECF_LEAF | ECF_NOTHROW, NULL)
+
 #undef DEF_INTERNAL_INT_FN
 #undef DEF_INTERNAL_FLT_FN
 #undef DEF_INTERNAL_FLT_FLOATN_FN
 #undef DEF_INTERNAL_SIGNED_OPTAB_FN
 #undef DEF_INTERNAL_OPTAB_FN
 #undef DEF_INTERNAL_FN

Reply via email to