Hi!

On Fri, 17 Feb 2017 14:00:09 +0100, I wrote:
> [...] for "normal" functions there is no reason to use the
> ".param" space for passing arguments in and out of functions. We can
> then get rid of the boilerplate code to move ".param %in_ar*" into ".reg
> %ar*", and the other way round for "%value_out"/"%value". This will then
> also simplify the call sites, where all that code "evaporates". That's
> actually something I started to look into, many months ago, and I now
> just dug out those changes, and will post them later.
>
> (Very likely, the PTX "JIT" compiler will do the very same thing without
> difficulty, but why not directly generate code that is less verbose to
> read?)
Using my WIP patch, the generated PTX code changes/is simplified as follows: // BEGIN GLOBAL FUNCTION DECL: f -.visible .func (.param.f32 %value_out) f (.param.u32 %in_ar0, .param.u64 %in_ar1); +.visible .func (.reg.f32 %value_out) f (.reg.u32 %ar0, .reg.u64 %ar1); // BEGIN GLOBAL FUNCTION DEF: f -.visible .func (.param.f32 %value_out) f (.param.u32 %in_ar0, .param.u64 %in_ar1) +.visible .func (.reg.f32 %value_out) f (.reg.u32 %ar0, .reg.u64 %ar1) { .reg.f32 %value; - .reg.u32 %ar0; - ld.param.u32 %ar0, [%in_ar0]; - .reg.u64 %ar1; - ld.param.u64 %ar1, [%in_ar1]; .reg.f64 %r23; .reg.f32 %r24; .reg.u32 %r25; @@ -34,15 +30,15 @@ $L3: mov.f32 %r24, 0f00000000; $L1: mov.f32 %value, %r24; - st.param.f32 [%value_out], %value; + mov.f32 %value_out, %value; ret; } // BEGIN GLOBAL FUNCTION DECL: main -.visible .func (.param.u32 %value_out) main (.param.u32 %in_ar0, .param.u64 %in_ar1); +.visible .func (.reg.u32 %value_out) main (.reg.u32 %ar0, .reg.u64 %ar1); // BEGIN GLOBAL FUNCTION DEF: main -.visible .func (.param.u32 %value_out) main (.param.u32 %in_ar0, .param.u64 %in_ar1) +.visible .func (.reg.u32 %value_out) main (.reg.u32 %ar0, .reg.u64 %ar1) { .reg.u32 %value; .local .align 8 .b8 %frame_ar[32]; @@ -70,13 +66,9 @@ $L1: st.u64 [%frame+24], %r29; add.u64 %r31, %frame, 16; { - .param.f32 %value_in; - .param.u32 %out_arg1; - st.param.u32 [%out_arg1], %r26; - .param.u64 %out_arg2; - st.param.u64 [%out_arg2], %r31; - call (%value_in), f, (%out_arg1, %out_arg2); - ld.param.f32 %r32, [%value_in]; + .reg.f32 %value_in; + call (%value_in), f, (%r26, %r31); + mov.f32 %r32, %value_in; } setp.eq.f32 %r33, %r32, 0f00000000; @%r33 bra $L5; @@ -89,17 +81,13 @@ $L5: st.u64 [%frame+24], %r36; mov.u32 %r34, 1; { - .param.f32 %value_in; - .param.u32 %out_arg1; - st.param.u32 [%out_arg1], %r34; - .param.u64 %out_arg2; - st.param.u64 [%out_arg2], %r31; - call (%value_in), f, (%out_arg1, %out_arg2); - ld.param.f32 %r39, [%value_in]; + .reg.f32 %value_in; + call (%value_in), f, (%r34, 
%r31); + mov.f32 %r39, %value_in; } setp.neu.f32 %r40, %r39, 0f3f800000; @%r40 bra $L6; mov.u32 %value, 0; - st.param.u32 [%value_out], %value; + mov.u32 %value_out, %value; ret; } (Not yet directly using "%value_out" instead of the intermediate "%value" register.) Is such a patch something to pursue to completion? --- gcc/config/nvptx/nvptx.c +++ gcc/config/nvptx/nvptx.c @@ -603,19 +603,32 @@ nvptx_promote_function_mode (const_tree type, machine_mode mode, to an argument register and it is greater than zero if we're copying to a specific hard register. */ +static bool write_as_kernel (tree attrs); static int write_arg_mode (std::stringstream &s, int for_reg, int argno, - machine_mode mode) + machine_mode mode, const_tree decl) { + bool kernel = (decl != NULL_TREE) && write_as_kernel (DECL_ATTRIBUTES (decl)); const char *ptx_type = nvptx_ptx_type_from_mode (mode, false); if (for_reg < 0) { /* Writing PTX prototype. */ s << (argno ? ", " : " ("); - s << ".param" << ptx_type << " %in_ar" << argno; + if (kernel) + s << ".param" << ptx_type << " %in_ar" << argno; + else +#if 0 + s << ".reg" << ptx_type << " %in_ar" << argno; +#else + s << ".reg" << ptx_type << " %ar" << argno; +#endif } +#if 0 else +#else + else if (kernel || for_reg) +#endif { s << "\t.reg" << ptx_type << " "; if (for_reg) @@ -625,12 +638,31 @@ write_arg_mode (std::stringstream &s, int for_reg, int argno, s << ";\n"; if (argno >= 0) { - s << "\tld.param" << ptx_type << " "; - if (for_reg) - s << reg_names[for_reg]; + if (kernel) + { + s << "\tld.param" << ptx_type << " "; + if (for_reg) + s << reg_names[for_reg]; + else + s << "%ar" << argno; + s << ", [%in_ar" << argno << "];\n"; + } else - s << "%ar" << argno; - s << ", [%in_ar" << argno << "];\n"; + { + s << "\tmov" << ptx_type << " "; + if (for_reg) + s << reg_names[for_reg]; + else + s << "%ar" << argno; + /* TODO: we should directly emit "reg_names[for_reg]" above when + writing prototype, but will need to change all call sites, + because these 
just pass in -1 for for_reg. With that changed, + we can then avoid this additional ".reg", and the "mov". */ + if (for_reg) + s << ", %ar" << argno << ";\n"; + else + s << ", %in_ar" << argno << ";\n"; + } } } return argno + 1; @@ -646,7 +678,7 @@ write_arg_mode (std::stringstream &s, int for_reg, int argno, static int write_arg_type (std::stringstream &s, int for_reg, int argno, - tree type, bool prototyped) + tree type, bool prototyped, const_tree decl) { machine_mode mode = TYPE_MODE (type); @@ -669,24 +701,31 @@ write_arg_type (std::stringstream &s, int for_reg, int argno, mode = promote_arg (mode, prototyped); if (split) - argno = write_arg_mode (s, for_reg, argno, mode); + argno = write_arg_mode (s, for_reg, argno, mode, decl); } - return write_arg_mode (s, for_reg, argno, mode); + return write_arg_mode (s, for_reg, argno, mode, decl); } /* Emit a PTX return as a prototype or function prologue declaration for MODE. */ static void -write_return_mode (std::stringstream &s, bool for_proto, machine_mode mode) +write_return_mode (std::stringstream &s, bool for_proto, machine_mode mode, const_tree decl) { const char *ptx_type = nvptx_ptx_type_from_mode (mode, false); const char *pfx = "\t.reg"; const char *sfx = ";\n"; if (for_proto) - pfx = "(.param", sfx = "_out) "; + { + bool kernel = (decl != NULL_TREE) && write_as_kernel (DECL_ATTRIBUTES (decl)); + if (kernel) + pfx = "(.param"; + else + pfx = "(.reg"; + sfx = "_out) "; + } s << pfx << ptx_type << " " << reg_names[NVPTX_RETURN_REGNUM] << sfx; } @@ -697,7 +736,7 @@ write_return_mode (std::stringstream &s, bool for_proto, machine_mode mode) match the regular GCC function return mashalling. 
*/ static bool -write_return_type (std::stringstream &s, bool for_proto, tree type) +write_return_type (std::stringstream &s, bool for_proto, tree type, const_tree decl) { machine_mode mode = TYPE_MODE (type); @@ -728,7 +767,7 @@ write_return_type (std::stringstream &s, bool for_proto, tree type) else mode = promote_return (mode); - write_return_mode (s, for_proto, mode); + write_return_mode (s, for_proto, mode, decl); return return_in_mem; } @@ -824,7 +863,7 @@ write_fn_proto (std::stringstream &s, bool is_defn, } /* Declare the result. */ - bool return_in_mem = write_return_type (s, true, result_type); + bool return_in_mem = write_return_type (s, true, result_type, decl); s << name; @@ -832,7 +871,7 @@ write_fn_proto (std::stringstream &s, bool is_defn, /* Emit argument list. */ if (return_in_mem) - argno = write_arg_type (s, -1, argno, ptr_type_node, true); + argno = write_arg_type (s, -1, argno, ptr_type_node, true, decl); /* We get: NULL in TYPE_ARG_TYPES, for old-style functions @@ -852,21 +891,21 @@ write_fn_proto (std::stringstream &s, bool is_defn, tree type = prototyped ? 
TREE_VALUE (args) : TREE_TYPE (args); if (not_atomic_weak_arg) - argno = write_arg_type (s, -1, argno, type, prototyped); + argno = write_arg_type (s, -1, argno, type, prototyped, decl); else gcc_assert (type == boolean_type_node); } if (stdarg_p (fntype)) - argno = write_arg_type (s, -1, argno, ptr_type_node, true); + argno = write_arg_type (s, -1, argno, ptr_type_node, true, decl); if (DECL_STATIC_CHAIN (decl)) - argno = write_arg_type (s, -1, argno, ptr_type_node, true); + argno = write_arg_type (s, -1, argno, ptr_type_node, true, decl); if (!argno && strcmp (name, "main") == 0) { - argno = write_arg_type (s, -1, argno, integer_type_node, true); - argno = write_arg_type (s, -1, argno, ptr_type_node, true); + argno = write_arg_type (s, -1, argno, integer_type_node, true, decl); + argno = write_arg_type (s, -1, argno, ptr_type_node, true, decl); } if (argno) @@ -899,7 +938,7 @@ write_fn_proto_from_insn (std::stringstream &s, const char *name, } if (result != NULL_RTX) - write_return_mode (s, true, GET_MODE (result)); + write_return_mode (s, true, GET_MODE (result), NULL_TREE); s << name; @@ -911,7 +950,7 @@ write_fn_proto_from_insn (std::stringstream &s, const char *name, sequence. */ machine_mode mode = GET_MODE (XEXP (XVECEXP (pat, 0, i), 0)); - write_arg_mode (s, -1, i - 1, mode); + write_arg_mode (s, -1, i - 1, mode, NULL_TREE); } if (arg_end != 1) s << ")"; @@ -1189,9 +1228,9 @@ nvptx_declare_function_name (FILE *file, const char *name, const_tree decl) write_fn_proto (s, true, name, decl); s << "{\n"; - bool return_in_mem = write_return_type (s, false, result_type); + bool return_in_mem = write_return_type (s, false, result_type, decl); if (return_in_mem) - argno = write_arg_type (s, 0, argno, ptr_type_node, true); + argno = write_arg_type (s, 0, argno, ptr_type_node, true, decl); /* Declare and initialize incoming arguments. 
*/ tree args = TYPE_ARG_TYPES (fntype); @@ -1206,17 +1245,17 @@ nvptx_declare_function_name (FILE *file, const char *name, const_tree decl) { tree type = prototyped ? TREE_VALUE (args) : TREE_TYPE (args); - argno = write_arg_type (s, 0, argno, type, prototyped); + argno = write_arg_type (s, 0, argno, type, prototyped, decl); } if (stdarg_p (fntype)) argno = write_arg_type (s, ARG_POINTER_REGNUM, argno, ptr_type_node, - true); + true, decl); if (DECL_STATIC_CHAIN (decl) || cfun->machine->has_chain) write_arg_type (s, STATIC_CHAIN_REGNUM, DECL_STATIC_CHAIN (decl) ? argno : -1, ptr_type_node, - true); + true, decl); fprintf (file, "%s", s.str().c_str()); @@ -1290,8 +1329,14 @@ nvptx_output_return (void) { machine_mode mode = (machine_mode)cfun->machine->return_mode; + const char *fmt; + bool kernel = write_as_kernel (DECL_ATTRIBUTES (current_function_decl)); + if (kernel) + fmt = "\tst.param%s\t[%s_out], %s;\n"; + else + fmt = "\tmov%s\t%s_out, %s;\n"; if (mode != VOIDmode) - fprintf (asm_out_file, "\tst.param%s\t[%s_out], %s;\n", + fprintf (asm_out_file, fmt, nvptx_ptx_type_from_mode (mode, false), reg_names[NVPTX_RETURN_REGNUM], reg_names[NVPTX_RETURN_REGNUM]); @@ -2063,7 +2108,9 @@ nvptx_output_call_insn (rtx_insn *insn, rtx result, rtx callee) fprintf (asm_out_file, "\t{\n"); if (result != NULL) - fprintf (asm_out_file, "\t\t.param%s %s_in;\n", + //We can never have a kernel call another kernel. 
+ //fprintf (asm_out_file, "\t\t.param%s %s_in;\n", + fprintf (asm_out_file, "\t\t.reg%s %s_in;\n", nvptx_ptx_type_from_mode (GET_MODE (result), false), reg_names[NVPTX_RETURN_REGNUM]); @@ -2088,6 +2135,7 @@ nvptx_output_call_insn (rtx_insn *insn, rtx result, rtx callee) fputs (s.str().c_str(), asm_out_file); } +#if 0 for (int argno = 1; argno < arg_end; argno++) { rtx t = XEXP (XVECEXP (pat, 0, argno), 0); @@ -2095,12 +2143,14 @@ nvptx_output_call_insn (rtx_insn *insn, rtx result, rtx callee) const char *ptx_type = nvptx_ptx_type_from_mode (mode, false); /* Mode splitting has already been done. */ - fprintf (asm_out_file, "\t\t.param%s %%out_arg%d;\n" - "\t\tst.param%s [%%out_arg%d], ", + //We can never have a kernel call another kernel, so don't need to care for ".param" here. + fprintf (asm_out_file, "\t\t.reg%s %%out_arg%d;\n" + "\t\tmov%s %%out_arg%d, ", ptx_type, argno, ptx_type, argno); output_reg (asm_out_file, REGNO (t), VOIDmode); fprintf (asm_out_file, ";\n"); } +#endif /* The '.' stands for the call's predicate, if any. */ nvptx_print_operand (asm_out_file, NULL_RTX, '.'); @@ -2120,7 +2170,13 @@ nvptx_output_call_insn (rtx_insn *insn, rtx result, rtx callee) const char *open = "("; for (int argno = 1; argno < arg_end; argno++) { +#if 0 fprintf (asm_out_file, ", %s%%out_arg%d", open, argno); +#else + fprintf (asm_out_file, ", %s", open); + rtx t = XEXP (XVECEXP (pat, 0, argno), 0); + output_reg (asm_out_file, REGNO (t), VOIDmode); +#endif open = ""; } if (decl && DECL_STATIC_CHAIN (decl)) @@ -2147,11 +2203,12 @@ nvptx_output_call_insn (rtx_insn *insn, rtx result, rtx callee) if (result) { - static char rval[sizeof ("\tld.param%%t0\t%%0, [%%%s_in];\n\t}") + 8]; + //We can never have a kernel call another kernel. + static char rval[sizeof ("\tmov%%t0\t%%0, %%%s_in;\n\t}") + 8]; if (!rval[0]) /* We must escape the '%' that starts RETURN_REGNUM. 
*/ - sprintf (rval, "\tld.param%%t0\t%%0, [%%%s_in];\n\t}", + sprintf (rval, "\tmov%%t0\t%%0, %%%s_in;\n\t}", reg_names[NVPTX_RETURN_REGNUM]); return rval; } Grüße Thomas