Hi!

On Fri, 17 Feb 2017 14:00:09 +0100, I wrote:
> [...] for "normal" functions there is no reason to use the
> ".param" space for passing arguments in and out of functions.  We can
> then get rid of the boilerplate code to move ".param %in_ar*" into ".reg
> %ar*", and the other way round for "%value_out"/"%value".  This will then
> also simplify the call sites, where all that code "evaporates".  That's
> actually something I started to look into, many months ago, and I now
> just dug out those changes, and will post them later.
> 
> (Very likely, the PTX "JIT" compiler will do the very same thing without
> difficulty, but why not directly generate code that is less verbose to
> read?)

With my WIP patch applied, the generated PTX code is simplified as
follows:

     // BEGIN GLOBAL FUNCTION DECL: f
    -.visible .func (.param.f32 %value_out) f (.param.u32 %in_ar0, .param.u64 %in_ar1);
    +.visible .func (.reg.f32 %value_out) f (.reg.u32 %ar0, .reg.u64 %ar1);
    
     // BEGIN GLOBAL FUNCTION DEF: f
    -.visible .func (.param.f32 %value_out) f (.param.u32 %in_ar0, .param.u64 %in_ar1)
    +.visible .func (.reg.f32 %value_out) f (.reg.u32 %ar0, .reg.u64 %ar1)
     {
            .reg.f32 %value;
    -       .reg.u32 %ar0;
    -       ld.param.u32 %ar0, [%in_ar0];
    -       .reg.u64 %ar1;
    -       ld.param.u64 %ar1, [%in_ar1];
            .reg.f64 %r23;
            .reg.f32 %r24;
            .reg.u32 %r25;
    @@ -34,15 +30,15 @@ $L3:
                    mov.f32 %r24, 0f00000000;
     $L1:
                    mov.f32 %value, %r24;
    -       st.param.f32    [%value_out], %value;
    +       mov.f32 %value_out, %value;
            ret;
     }
    
     // BEGIN GLOBAL FUNCTION DECL: main
    -.visible .func (.param.u32 %value_out) main (.param.u32 %in_ar0, .param.u64 %in_ar1);
    +.visible .func (.reg.u32 %value_out) main (.reg.u32 %ar0, .reg.u64 %ar1);
    
     // BEGIN GLOBAL FUNCTION DEF: main
    -.visible .func (.param.u32 %value_out) main (.param.u32 %in_ar0, .param.u64 %in_ar1)
    +.visible .func (.reg.u32 %value_out) main (.reg.u32 %ar0, .reg.u64 %ar1)
     {
            .reg.u32 %value;
            .local .align 8 .b8 %frame_ar[32];
    @@ -70,13 +66,9 @@ $L1:
                    st.u64  [%frame+24], %r29;
                    add.u64 %r31, %frame, 16;
            {
    -               .param.f32 %value_in;
    -               .param.u32 %out_arg1;
    -               st.param.u32 [%out_arg1], %r26;
    -               .param.u64 %out_arg2;
    -               st.param.u64 [%out_arg2], %r31;
    -               call (%value_in), f, (%out_arg1, %out_arg2);
    -               ld.param.f32    %r32, [%value_in];
    +               .reg.f32 %value_in;
    +               call (%value_in), f, (%r26, %r31);
    +               mov.f32 %r32, %value_in;
            }
                    setp.eq.f32     %r33, %r32, 0f00000000;
            @%r33   bra     $L5;
    @@ -89,17 +81,13 @@ $L5:
                    st.u64  [%frame+24], %r36;
                    mov.u32 %r34, 1;
            {
    -               .param.f32 %value_in;
    -               .param.u32 %out_arg1;
    -               st.param.u32 [%out_arg1], %r34;
    -               .param.u64 %out_arg2;
    -               st.param.u64 [%out_arg2], %r31;
    -               call (%value_in), f, (%out_arg1, %out_arg2);
    -               ld.param.f32    %r39, [%value_in];
    +               .reg.f32 %value_in;
    +               call (%value_in), f, (%r34, %r31);
    +               mov.f32 %r39, %value_in;
            }
                    setp.neu.f32    %r40, %r39, 0f3f800000;
            @%r40   bra     $L6;
                    mov.u32 %value, 0;
    -       st.param.u32    [%value_out], %value;
    +       mov.u32 %value_out, %value;
            ret;
     }

(The patch does not yet use "%value_out" directly; the result still goes
through the intermediate "%value" register.)

Is such a patch worth pursuing to completion?

--- gcc/config/nvptx/nvptx.c
+++ gcc/config/nvptx/nvptx.c
@@ -603,19 +603,32 @@ nvptx_promote_function_mode (const_tree type, 
machine_mode mode,
    to an argument register and it is greater than zero if we're
    copying to a specific hard register.  */
 
+static bool write_as_kernel (tree attrs);
 static int
 write_arg_mode (std::stringstream &s, int for_reg, int argno,
-               machine_mode mode)
+               machine_mode mode, const_tree decl)
 {
+  bool kernel = (decl != NULL_TREE) && write_as_kernel (DECL_ATTRIBUTES 
(decl));
   const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);
 
   if (for_reg < 0)
     {
       /* Writing PTX prototype.  */
       s << (argno ? ", " : " (");
-      s << ".param" << ptx_type << " %in_ar" << argno;
+      if (kernel)
+       s << ".param" << ptx_type << " %in_ar" << argno;
+      else
+#if 0
+       s << ".reg" << ptx_type << " %in_ar" << argno;
+#else
+       s << ".reg" << ptx_type << " %ar" << argno;
+#endif
     }
+#if 0
   else
+#else
+  else if (kernel || for_reg)
+#endif
     {
       s << "\t.reg" << ptx_type << " ";
       if (for_reg)
@@ -625,12 +638,31 @@ write_arg_mode (std::stringstream &s, int for_reg, int 
argno,
       s << ";\n";
       if (argno >= 0)
        {
-         s << "\tld.param" << ptx_type << " ";
-         if (for_reg)
-           s << reg_names[for_reg];
+         if (kernel)
+           {
+             s << "\tld.param" << ptx_type << " ";
+             if (for_reg)
+               s << reg_names[for_reg];
+             else
+               s << "%ar" << argno;
+             s << ", [%in_ar" << argno << "];\n";
+           }
          else
-           s << "%ar" << argno;
-         s << ", [%in_ar" << argno << "];\n";
+           {
+             s << "\tmov" << ptx_type << " ";
+             if (for_reg)
+               s << reg_names[for_reg];
+             else
+               s << "%ar" << argno;
+             /* TODO: we should directly emit "reg_names[for_reg]" above when
+                writing prototype, but will need to change all call sites,
+                because these just pass in -1 for for_reg.  With that changed,
+                we can then avoid this additional ".reg", and the "mov".  */
+             if (for_reg)
+               s << ", %ar" << argno << ";\n";
+             else
+               s << ", %in_ar" << argno << ";\n";
+           }
        }
     }
   return argno + 1;
@@ -646,7 +678,7 @@ write_arg_mode (std::stringstream &s, int for_reg, int 
argno,
 
 static int
 write_arg_type (std::stringstream &s, int for_reg, int argno,
-               tree type, bool prototyped)
+               tree type, bool prototyped, const_tree decl)
 {
   machine_mode mode = TYPE_MODE (type);
 
@@ -669,24 +701,31 @@ write_arg_type (std::stringstream &s, int for_reg, int 
argno,
 
       mode = promote_arg (mode, prototyped);
       if (split)
-       argno = write_arg_mode (s, for_reg, argno, mode);
+       argno = write_arg_mode (s, for_reg, argno, mode, decl);
     }
 
-  return write_arg_mode (s, for_reg, argno, mode);
+  return write_arg_mode (s, for_reg, argno, mode, decl);
 }
 
 /* Emit a PTX return as a prototype or function prologue declaration
    for MODE.  */
 
 static void
-write_return_mode (std::stringstream &s, bool for_proto, machine_mode mode)
+write_return_mode (std::stringstream &s, bool for_proto, machine_mode mode, 
const_tree decl)
 {
   const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);
   const char *pfx = "\t.reg";
   const char *sfx = ";\n";
   
   if (for_proto)
-    pfx = "(.param", sfx = "_out) ";
+    {
+      bool kernel = (decl != NULL_TREE) && write_as_kernel (DECL_ATTRIBUTES 
(decl));
+      if (kernel)
+       pfx = "(.param";
+      else
+       pfx = "(.reg";
+      sfx = "_out) ";
+    }
   
   s << pfx << ptx_type << " " << reg_names[NVPTX_RETURN_REGNUM] << sfx;
 }
@@ -697,7 +736,7 @@ write_return_mode (std::stringstream &s, bool for_proto, 
machine_mode mode)
    match the regular GCC function return mashalling.  */
 
 static bool
-write_return_type (std::stringstream &s, bool for_proto, tree type)
+write_return_type (std::stringstream &s, bool for_proto, tree type, const_tree 
decl)
 {
   machine_mode mode = TYPE_MODE (type);
 
@@ -728,7 +767,7 @@ write_return_type (std::stringstream &s, bool for_proto, 
tree type)
   else
     mode = promote_return (mode);
 
-  write_return_mode (s, for_proto, mode);
+  write_return_mode (s, for_proto, mode, decl);
 
   return return_in_mem;
 }
@@ -824,7 +863,7 @@ write_fn_proto (std::stringstream &s, bool is_defn,
       }
 
   /* Declare the result.  */
-  bool return_in_mem = write_return_type (s, true, result_type);
+  bool return_in_mem = write_return_type (s, true, result_type, decl);
 
   s << name;
 
@@ -832,7 +871,7 @@ write_fn_proto (std::stringstream &s, bool is_defn,
 
   /* Emit argument list.  */
   if (return_in_mem)
-    argno = write_arg_type (s, -1, argno, ptr_type_node, true);
+    argno = write_arg_type (s, -1, argno, ptr_type_node, true, decl);
 
   /* We get:
      NULL in TYPE_ARG_TYPES, for old-style functions
@@ -852,21 +891,21 @@ write_fn_proto (std::stringstream &s, bool is_defn,
       tree type = prototyped ? TREE_VALUE (args) : TREE_TYPE (args);
       
       if (not_atomic_weak_arg)
-       argno = write_arg_type (s, -1, argno, type, prototyped);
+       argno = write_arg_type (s, -1, argno, type, prototyped, decl);
       else
        gcc_assert (type == boolean_type_node);
     }
 
   if (stdarg_p (fntype))
-    argno = write_arg_type (s, -1, argno, ptr_type_node, true);
+    argno = write_arg_type (s, -1, argno, ptr_type_node, true, decl);
 
   if (DECL_STATIC_CHAIN (decl))
-    argno = write_arg_type (s, -1, argno, ptr_type_node, true);
+    argno = write_arg_type (s, -1, argno, ptr_type_node, true, decl);
 
   if (!argno && strcmp (name, "main") == 0)
     {
-      argno = write_arg_type (s, -1, argno, integer_type_node, true);
-      argno = write_arg_type (s, -1, argno, ptr_type_node, true);
+      argno = write_arg_type (s, -1, argno, integer_type_node, true, decl);
+      argno = write_arg_type (s, -1, argno, ptr_type_node, true, decl);
     }
 
   if (argno)
@@ -899,7 +938,7 @@ write_fn_proto_from_insn (std::stringstream &s, const char 
*name,
     }
 
   if (result != NULL_RTX)
-    write_return_mode (s, true, GET_MODE (result));
+    write_return_mode (s, true, GET_MODE (result), NULL_TREE);
 
   s << name;
 
@@ -911,7 +950,7 @@ write_fn_proto_from_insn (std::stringstream &s, const char 
*name,
         sequence.  */
       machine_mode mode = GET_MODE (XEXP (XVECEXP (pat, 0, i), 0));
 
-      write_arg_mode (s, -1, i - 1, mode);
+      write_arg_mode (s, -1, i - 1, mode, NULL_TREE);
     }
   if (arg_end != 1)
     s << ")";
@@ -1189,9 +1228,9 @@ nvptx_declare_function_name (FILE *file, const char 
*name, const_tree decl)
   write_fn_proto (s, true, name, decl);
   s << "{\n";
 
-  bool return_in_mem = write_return_type (s, false, result_type);
+  bool return_in_mem = write_return_type (s, false, result_type, decl);
   if (return_in_mem)
-    argno = write_arg_type (s, 0, argno, ptr_type_node, true);
+    argno = write_arg_type (s, 0, argno, ptr_type_node, true, decl);
   
   /* Declare and initialize incoming arguments.  */
   tree args = TYPE_ARG_TYPES (fntype);
@@ -1206,17 +1245,17 @@ nvptx_declare_function_name (FILE *file, const char 
*name, const_tree decl)
     {
       tree type = prototyped ? TREE_VALUE (args) : TREE_TYPE (args);
 
-      argno = write_arg_type (s, 0, argno, type, prototyped);
+      argno = write_arg_type (s, 0, argno, type, prototyped, decl);
     }
 
   if (stdarg_p (fntype))
     argno = write_arg_type (s, ARG_POINTER_REGNUM, argno, ptr_type_node,
-                           true);
+                           true, decl);
 
   if (DECL_STATIC_CHAIN (decl) || cfun->machine->has_chain)
     write_arg_type (s, STATIC_CHAIN_REGNUM,
                    DECL_STATIC_CHAIN (decl) ? argno : -1, ptr_type_node,
-                   true);
+                   true, decl);
 
   fprintf (file, "%s", s.str().c_str());
 
@@ -1290,8 +1329,14 @@ nvptx_output_return (void)
 {
   machine_mode mode = (machine_mode)cfun->machine->return_mode;
 
+  const char *fmt;
+  bool kernel = write_as_kernel (DECL_ATTRIBUTES (current_function_decl));
+  if (kernel)
+    fmt = "\tst.param%s\t[%s_out], %s;\n";
+  else
+    fmt = "\tmov%s\t%s_out, %s;\n";
   if (mode != VOIDmode)
-    fprintf (asm_out_file, "\tst.param%s\t[%s_out], %s;\n",
+    fprintf (asm_out_file, fmt,
             nvptx_ptx_type_from_mode (mode, false),
             reg_names[NVPTX_RETURN_REGNUM],
             reg_names[NVPTX_RETURN_REGNUM]);
@@ -2063,7 +2108,9 @@ nvptx_output_call_insn (rtx_insn *insn, rtx result, rtx 
callee)
 
   fprintf (asm_out_file, "\t{\n");
   if (result != NULL)
-    fprintf (asm_out_file, "\t\t.param%s %s_in;\n",
+    //We can never have a kernel call another kernel.
+    //fprintf (asm_out_file, "\t\t.param%s %s_in;\n",
+    fprintf (asm_out_file, "\t\t.reg%s %s_in;\n",
             nvptx_ptx_type_from_mode (GET_MODE (result), false),
             reg_names[NVPTX_RETURN_REGNUM]);
 
@@ -2088,6 +2135,7 @@ nvptx_output_call_insn (rtx_insn *insn, rtx result, rtx 
callee)
       fputs (s.str().c_str(), asm_out_file);
     }
 
+#if 0
   for (int argno = 1; argno < arg_end; argno++)
     {
       rtx t = XEXP (XVECEXP (pat, 0, argno), 0);
@@ -2095,12 +2143,14 @@ nvptx_output_call_insn (rtx_insn *insn, rtx result, rtx 
callee)
       const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);
 
       /* Mode splitting has already been done.  */
-      fprintf (asm_out_file, "\t\t.param%s %%out_arg%d;\n"
-              "\t\tst.param%s [%%out_arg%d], ",
+      //We can never have a kernel call another kernel, so don't need to care 
for ".param" here.
+      fprintf (asm_out_file, "\t\t.reg%s %%out_arg%d;\n"
+              "\t\tmov%s %%out_arg%d, ",
               ptx_type, argno, ptx_type, argno);
       output_reg (asm_out_file, REGNO (t), VOIDmode);
       fprintf (asm_out_file, ";\n");
     }
+#endif
 
   /* The '.' stands for the call's predicate, if any.  */
   nvptx_print_operand (asm_out_file, NULL_RTX, '.');
@@ -2120,7 +2170,13 @@ nvptx_output_call_insn (rtx_insn *insn, rtx result, rtx 
callee)
   const char *open = "(";
   for (int argno = 1; argno < arg_end; argno++)
     {
+#if 0
       fprintf (asm_out_file, ", %s%%out_arg%d", open, argno);
+#else
+      fprintf (asm_out_file, ", %s", open);
+      rtx t = XEXP (XVECEXP (pat, 0, argno), 0);
+      output_reg (asm_out_file, REGNO (t), VOIDmode);
+#endif
       open = "";
     }
   if (decl && DECL_STATIC_CHAIN (decl))
@@ -2147,11 +2203,12 @@ nvptx_output_call_insn (rtx_insn *insn, rtx result, rtx 
callee)
 
   if (result)
     {
-      static char rval[sizeof ("\tld.param%%t0\t%%0, [%%%s_in];\n\t}") + 8];
+      //We can never have a kernel call another kernel.
+      static char rval[sizeof ("\tmov%%t0\t%%0, %%%s_in;\n\t}") + 8];
 
       if (!rval[0])
        /* We must escape the '%' that starts RETURN_REGNUM.  */
-       sprintf (rval, "\tld.param%%t0\t%%0, [%%%s_in];\n\t}",
+       sprintf (rval, "\tmov%%t0\t%%0, %%%s_in;\n\t}",
                 reg_names[NVPTX_RETURN_REGNUM]);
       return rval;
     }


Grüße
 Thomas

Reply via email to