Hi! On 2024-09-20T18:49:46+0200, I wrote: > We'd like to raise nvptx code generation from PTX ISA 6.0, sm_30 "Kepler" > to default PTX ISA 7.3, sm_52 "Maxwell", therefore CUDA 11.3 (2021-04). > This is, primarily, so that we're able to use 'alloca' and related stack > manipulation instructions, and improve upon the current: > > sorry ("target cannot support alloca");
Pushed to trunk branch as commit 3861d362ec7e3c50742fc43833fe9d8674f4070e "nvptx: PTX 'alloca' for '-mptx=7.3'+, '-march=sm_52'+ [PR65181]", see attached. Grüße Thomas
>From 3861d362ec7e3c50742fc43833fe9d8674f4070e Mon Sep 17 00:00:00 2001 From: Thomas Schwinge <tschwi...@baylibre.com> Date: Sat, 7 Dec 2024 00:17:49 +0100 Subject: [PATCH] nvptx: PTX 'alloca' for '-mptx=7.3'+, '-march=sm_52'+ [PR65181] ..., and use it for '-mno-soft-stack': PTX "native" stacks. PR target/65181 gcc/ * config/nvptx/nvptx.cc (nvptx_get_drap_rtx): Handle '!TARGET_SOFT_STACK'. * config/nvptx/nvptx.md (define_c_enum "unspec"): Add 'UNSPEC_STACKSAVE', 'UNSPEC_STACKRESTORE'. (define_expand "allocate_stack", define_expand "save_stack_block") (define_expand "restore_stack_block"): Handle '!TARGET_SOFT_STACK', PTX 'alloca'. (define_insn "@nvptx_alloca_<mode>") (define_insn "@nvptx_stacksave_<mode>") (define_insn "@nvptx_stackrestore_<mode>"): New. * doc/invoke.texi (Nvidia PTX Options): Update '-msoft-stack', '-mno-soft-stack'. * doc/sourcebuild.texi (nvptx-specific attributes): Document 'nvptx_runtime_alloca_ptx'. (Add Options): Document 'nvptx_alloca_ptx'. gcc/testsuite/ * gcc.target/nvptx/alloca-1.c: Evolve into... * gcc.target/nvptx/alloca-1-O0.c: ... this, ... * gcc.target/nvptx/alloca-1-O1.c: ... this, and... * gcc.target/nvptx/alloca-1-sm_30.c: ... this. * gcc.target/nvptx/vla-1.c: Evolve into... * gcc.target/nvptx/vla-1-O0.c: ... this, ... * gcc.target/nvptx/vla-1-O1.c: ... this, and... * gcc.target/nvptx/vla-1-sm_30.c: ... this. * gcc.c-torture/execute/pr36321.c: Adjust. * gcc.target/nvptx/__builtin_alloca_0-1-O0.c: Likewise. * gcc.target/nvptx/__builtin_alloca_0-1-O1.c: Likewise. * gcc.target/nvptx/__builtin_stack_save___builtin_stack_restore-1.c: Likewise. * gcc.target/nvptx/softstack.c: Likewise. * gcc.target/nvptx/__builtin_stack_save___builtin_stack_restore-1-sm_30.c: New. * gcc.target/nvptx/alloca-2-O0.c: Likewise. * gcc.target/nvptx/alloca-3-O1.c: Likewise. * gcc.target/nvptx/alloca-4-O3.c: Likewise. * gcc.target/nvptx/alloca-5.c: Likewise. * lib/target-supports.exp (check_effective_target_alloca): Adjust. 
(check_nvptx_default_ptx_isa_target_architecture_at_least) (check_nvptx_runtime_ptx_isa_target_architecture_at_least) (check_effective_target_nvptx_runtime_alloca_ptx) (add_options_for_nvptx_alloca_ptx): New. libgomp/ * fortran.c (omp_get_device_from_uid_): Adjust. * testsuite/libgomp.oacc-fortran/privatized-ref-2.f90: Likewise. --- gcc/config/nvptx/nvptx.cc | 4 +- gcc/config/nvptx/nvptx.md | 92 ++++++++++++--- gcc/doc/invoke.texi | 13 ++- gcc/doc/sourcebuild.texi | 6 + gcc/testsuite/gcc.c-torture/execute/pr36321.c | 3 + .../nvptx/__builtin_alloca_0-1-O0.c | 2 + .../nvptx/__builtin_alloca_0-1-O1.c | 2 + ...ack_save___builtin_stack_restore-1-sm_30.c | 28 +++++ ...tin_stack_save___builtin_stack_restore-1.c | 8 +- gcc/testsuite/gcc.target/nvptx/alloca-1-O0.c | 49 ++++++++ gcc/testsuite/gcc.target/nvptx/alloca-1-O1.c | 33 ++++++ .../nvptx/{alloca-1.c => alloca-1-sm_30.c} | 1 + gcc/testsuite/gcc.target/nvptx/alloca-2-O0.c | 12 ++ gcc/testsuite/gcc.target/nvptx/alloca-3-O1.c | 40 +++++++ gcc/testsuite/gcc.target/nvptx/alloca-4-O3.c | 55 +++++++++ gcc/testsuite/gcc.target/nvptx/alloca-5.c | 107 ++++++++++++++++++ gcc/testsuite/gcc.target/nvptx/softstack.c | 2 + gcc/testsuite/gcc.target/nvptx/vla-1-O0.c | 29 +++++ gcc/testsuite/gcc.target/nvptx/vla-1-O1.c | 40 +++++++ .../nvptx/{vla-1.c => vla-1-sm_30.c} | 1 + gcc/testsuite/lib/target-supports.exp | 105 ++++++++++++++++- libgomp/fortran.c | 4 +- .../libgomp.oacc-fortran/privatized-ref-2.f90 | 10 -- 23 files changed, 611 insertions(+), 35 deletions(-) create mode 100644 gcc/testsuite/gcc.target/nvptx/__builtin_stack_save___builtin_stack_restore-1-sm_30.c create mode 100644 gcc/testsuite/gcc.target/nvptx/alloca-1-O0.c create mode 100644 gcc/testsuite/gcc.target/nvptx/alloca-1-O1.c rename gcc/testsuite/gcc.target/nvptx/{alloca-1.c => alloca-1-sm_30.c} (83%) create mode 100644 gcc/testsuite/gcc.target/nvptx/alloca-2-O0.c create mode 100644 gcc/testsuite/gcc.target/nvptx/alloca-3-O1.c create mode 100644 
gcc/testsuite/gcc.target/nvptx/alloca-4-O3.c create mode 100644 gcc/testsuite/gcc.target/nvptx/alloca-5.c create mode 100644 gcc/testsuite/gcc.target/nvptx/vla-1-O0.c create mode 100644 gcc/testsuite/gcc.target/nvptx/vla-1-O1.c rename gcc/testsuite/gcc.target/nvptx/{vla-1.c => vla-1-sm_30.c} (83%) diff --git a/gcc/config/nvptx/nvptx.cc b/gcc/config/nvptx/nvptx.cc index 5860b3df6dd7..060f45318f45 100644 --- a/gcc/config/nvptx/nvptx.cc +++ b/gcc/config/nvptx/nvptx.cc @@ -245,7 +245,7 @@ default_ptx_version_option (void) warp convergence. */ res = MAX (res, PTX_VERSION_6_0); - /* For sm_52+, pick at least 7.3. */ + /* For sm_52+, pick at least 7.3, to enable PTX 'alloca'. */ if (ptx_isa_option >= PTX_ISA_SM52) res = MAX (res, PTX_VERSION_7_3); @@ -1797,7 +1797,7 @@ nvptx_function_ok_for_sibcall (tree, tree) static rtx nvptx_get_drap_rtx (void) { - if (TARGET_SOFT_STACK && stack_realign_drap) + if (stack_realign_drap) return arg_pointer_rtx; return NULL_RTX; } diff --git a/gcc/config/nvptx/nvptx.md b/gcc/config/nvptx/nvptx.md index b300f2e596c5..a22a088fb3ac 100644 --- a/gcc/config/nvptx/nvptx.md +++ b/gcc/config/nvptx/nvptx.md @@ -35,8 +35,9 @@ UNSPEC_FPINT_NEARBYINT UNSPEC_ALLOCA - UNSPEC_SET_SOFTSTACK + UNSPEC_STACKSAVE + UNSPEC_STACKRESTORE UNSPEC_DIM_SIZE @@ -1663,22 +1664,47 @@ (match_operand 1 "nvptx_register_operand")] "" { - if (TARGET_SOFT_STACK) + if (!TARGET_SOFT_STACK + && TARGET_PTX_7_3 + && TARGET_SM52) + emit_insn (gen_nvptx_alloca (Pmode, operands[0], operands[1])); + else if (!TARGET_SOFT_STACK) + { + sorry ("target cannot support alloca"); + emit_insn (gen_nop ()); + } + else if (TARGET_SOFT_STACK) { emit_move_insn (stack_pointer_rtx, gen_rtx_MINUS (Pmode, stack_pointer_rtx, operands[1])); emit_insn (gen_set_softstack (Pmode, stack_pointer_rtx)); emit_move_insn (operands[0], virtual_stack_dynamic_rtx); - DONE; } - /* The ptx documentation specifies an alloca intrinsic (for 32 bit - only) but notes it is not implemented. 
The assembler emits a - confused error message. Issue a blunt one now instead. */ - sorry ("target cannot support alloca"); - emit_insn (gen_nop ()); + else + gcc_unreachable (); DONE; }) +(define_insn "@nvptx_alloca_<mode>" + [(set (match_operand:P 0 "nvptx_register_operand" "=R") + (unspec:P [(match_operand:P 1 "nvptx_nonmemory_operand" "Ri")] + UNSPEC_ALLOCA))] + "TARGET_PTX_7_3 + && TARGET_SM52" + { + /* Convert the address from '.local' state space to generic. That way, + we don't have to use 'st.local', 'ld.local', and can easily pass the + address to other "generic functions". + TODO 'gcc.target/nvptx/alloca-5.c' */ + output_asm_insn ("{", NULL); + output_asm_insn ("\\t.reg%t0\\t%0_local;", operands); + output_asm_insn ("\\talloca%u0\\t%0_local, %1;", operands); + output_asm_insn ("\\tcvta.local%u0\\t%0, %0_local;", operands); + output_asm_insn ("}", NULL); + return ""; + } + [(set_attr "predicable" "no")]) + (define_insn "@set_softstack_<mode>" [(unspec [(match_operand:P 0 "nvptx_register_operand" "R")] UNSPEC_SET_SOFTSTACK)] @@ -1692,30 +1718,64 @@ (match_operand 1 "register_operand" "")] "!TARGET_SOFT_STACK" { - /* The concept of a '%stack' pointer doesn't apply like this for - PTX "native" stacks. GCC however occasionally synthesizes - '__builtin_stack_save ()', '__builtin_stack_restore ()', and isn't able to - optimize them all away. Just submit a dummy -- user code shouldn't be - able to observe this. */ - emit_move_insn (operands[0], GEN_INT (0xdeadbeef)); + if (TARGET_PTX_7_3 + && TARGET_SM52) + { + gcc_checking_assert (REG_P (operands[0])); + emit_insn (gen_nvptx_stacksave (Pmode, operands[0], operands[1])); + } + else + { + /* The concept of a '%stack' pointer doesn't apply like this. + GCC however occasionally synthesizes '__builtin_stack_save ()', + '__builtin_stack_restore ()', and isn't able to optimize them all + away. Just submit a dummy -- user code shouldn't be able to observe + this. 
*/ + emit_move_insn (operands[0], GEN_INT (0xdeadbeef)); + } DONE; }) +(define_insn "@nvptx_stacksave_<mode>" + [(set (match_operand:P 0 "nvptx_register_operand" "=R") + (unspec:P [(match_operand:P 1 "register_operand" "R")] + UNSPEC_STACKSAVE))] + "TARGET_PTX_7_3 + && TARGET_SM52" + "%.\\tstacksave%u0\\t%0;") + (define_expand "restore_stack_block" [(match_operand 0 "register_operand" "") (match_operand 1 "register_operand" "")] "" { - if (!TARGET_SOFT_STACK) + if (!TARGET_SOFT_STACK + && TARGET_PTX_7_3 + && TARGET_SM52) + { + operands[1] = force_reg (Pmode, operands[1]); + emit_insn (gen_nvptx_stackrestore (Pmode, operands[0], operands[1])); + } + else if (!TARGET_SOFT_STACK) ; /* See 'save_stack_block'. */ - else + else if (TARGET_SOFT_STACK) { emit_move_insn (operands[0], operands[1]); emit_insn (gen_set_softstack (Pmode, operands[0])); } + else + gcc_unreachable (); DONE; }) +(define_insn "@nvptx_stackrestore_<mode>" + [(set (match_operand:P 0 "nvptx_register_operand" "=R") + (unspec:P [(match_operand:P 1 "nvptx_register_operand" "R")] + UNSPEC_STACKRESTORE))] + "TARGET_PTX_7_3 + && TARGET_SM52" + "%.\\tstackrestore%u1\\t%1;") + (define_expand "save_stack_function" [(match_operand 0 "register_operand" "") (match_operand 1 "register_operand" "")] diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi index 4583181f4f53..17fe2c64c1f8 100644 --- a/gcc/doc/invoke.texi +++ b/gcc/doc/invoke.texi @@ -30232,8 +30232,19 @@ Apply partitioned execution optimizations. This is the default when any level of optimization is selected. @opindex msoft-stack +@opindex mno-soft-stack @item -msoft-stack -Generate code that does not use @code{.local} memory +@itemx -mno-soft-stack +For @option{-mno-soft-stack} (the default, unless @option{-mgomp} has +been specified), use PTX ``native'' stacks, that is, +generate code that uses @code{.local} memory or PTX @code{alloca} +directly for stack storage. 
+Unless @option{-mptx=7.3} or higher and @option{-march=sm_52} or +higher are active, variable-length arrays and dynamically allocating +memory on the stack with @code{alloca} are not supported. + +For @option{-msoft-stack} (implied by @option{-mgomp}), +generate code that does not use @code{.local} memory or PTX @code{alloca} directly for stack storage. Instead, a per-warp stack pointer is maintained explicitly. This enables variable-length stack allocation (with variable-length arrays or @code{alloca}), and when global memory is used for diff --git a/gcc/doc/sourcebuild.texi b/gcc/doc/sourcebuild.texi index 0e0a7c806ebc..b5c1b23e5271 100644 --- a/gcc/doc/sourcebuild.texi +++ b/gcc/doc/sourcebuild.texi @@ -2434,6 +2434,9 @@ nvptx code by default compiles for at least PTX ISA version 6.0. The nvptx runtime environment supports the PTX ISA directive @code{.alias}. +@item nvptx_runtime_alloca_ptx +The nvptx runtime environment supports PTX 'alloca'. + @item nvptx_softstack nvptx @option{-msoft-stack} is enabled. @end table @@ -3359,6 +3362,9 @@ Only MIPS targets support this feature, and only then in certain modes. @item nvptx_alias_ptx Enable using the PTX ISA directive @code{.alias} on nvptx targets. +@item nvptx_alloca_ptx +Enable PTX 'alloca' on nvptx targets. + @item riscv_a Add the 'A' extension to the -march string on RISC-V targets. diff --git a/gcc/testsuite/gcc.c-torture/execute/pr36321.c b/gcc/testsuite/gcc.c-torture/execute/pr36321.c index 4af993dc0fd9..8fd91d8c768b 100644 --- a/gcc/testsuite/gcc.c-torture/execute/pr36321.c +++ b/gcc/testsuite/gcc.c-torture/execute/pr36321.c @@ -1,4 +1,7 @@ /* { dg-skip-if "requires alloca" { ! alloca } { "-O0" } { "" } } */ +/* See 'gcc.target/nvptx/__builtin_alloca_0-1-O0.c'. + { dg-xfail-if TODO { nvptx-*-* && { ! 
nvptx_softstack } } { "-O0" } { "" } } */ + extern void abort (void); extern __SIZE_TYPE__ strlen (const char *); diff --git a/gcc/testsuite/gcc.target/nvptx/__builtin_alloca_0-1-O0.c b/gcc/testsuite/gcc.target/nvptx/__builtin_alloca_0-1-O0.c index 8c00a66c7387..4b8a676da1dc 100644 --- a/gcc/testsuite/gcc.target/nvptx/__builtin_alloca_0-1-O0.c +++ b/gcc/testsuite/gcc.target/nvptx/__builtin_alloca_0-1-O0.c @@ -6,6 +6,8 @@ /* { dg-additional-options -save-temps } */ /* { dg-final { check-function-bodies {** } {} } } */ +/* See 'gcc.c-torture/execute/pr36321.c', '-O0'. */ + void sink(void *); void f(void) diff --git a/gcc/testsuite/gcc.target/nvptx/__builtin_alloca_0-1-O1.c b/gcc/testsuite/gcc.target/nvptx/__builtin_alloca_0-1-O1.c index 2d61065d44d6..e81eeb5597fa 100644 --- a/gcc/testsuite/gcc.target/nvptx/__builtin_alloca_0-1-O1.c +++ b/gcc/testsuite/gcc.target/nvptx/__builtin_alloca_0-1-O1.c @@ -6,6 +6,8 @@ /* { dg-additional-options -save-temps } */ /* { dg-final { check-function-bodies {** } {} } } */ +/* See 'gcc.c-torture/execute/pr36321.c', '-O0'. */ + void sink(void *); void f(void) diff --git a/gcc/testsuite/gcc.target/nvptx/__builtin_stack_save___builtin_stack_restore-1-sm_30.c b/gcc/testsuite/gcc.target/nvptx/__builtin_stack_save___builtin_stack_restore-1-sm_30.c new file mode 100644 index 000000000000..c2ac4915d15d --- /dev/null +++ b/gcc/testsuite/gcc.target/nvptx/__builtin_stack_save___builtin_stack_restore-1-sm_30.c @@ -0,0 +1,28 @@ +/* Document what we do for '__builtin_stack_save()', '__builtin_stack_restore()'. 
*/ + +/* { dg-do assemble } */ +/* { dg-options {-O3 -mno-soft-stack} } */ +/* { dg-additional-options -march=sm_30 } */ +/* { dg-additional-options -save-temps } */ +/* { dg-final { check-function-bodies {** } {} } } */ + +void *p; + +void f(void) +{ + // 0xdeadbeef + p = __builtin_stack_save(); + asm volatile ("" : : : "memory"); + // no-op + __builtin_stack_restore(p); + asm volatile ("" : : : "memory"); +} +/* +** f: +** \.visible \.func f +** { +** \.reg\.u64 (%r[0-9]+); +** mov\.u64 \1, 3735928559; +** st\.global\.u64 \[p\], \1; +** ret; +*/ diff --git a/gcc/testsuite/gcc.target/nvptx/__builtin_stack_save___builtin_stack_restore-1.c b/gcc/testsuite/gcc.target/nvptx/__builtin_stack_save___builtin_stack_restore-1.c index 35a879fd5973..a8eda0e04afa 100644 --- a/gcc/testsuite/gcc.target/nvptx/__builtin_stack_save___builtin_stack_restore-1.c +++ b/gcc/testsuite/gcc.target/nvptx/__builtin_stack_save___builtin_stack_restore-1.c @@ -2,6 +2,7 @@ /* { dg-do assemble } */ /* { dg-options {-O3 -mno-soft-stack} } */ +/* { dg-add-options nvptx_alloca_ptx } */ /* { dg-additional-options -save-temps } */ /* { dg-final { check-function-bodies {** } {} } } */ @@ -9,10 +10,8 @@ void *p; void f(void) { - // 0xdeadbeef p = __builtin_stack_save(); asm volatile ("" : : : "memory"); - // no-op __builtin_stack_restore(p); asm volatile ("" : : : "memory"); } @@ -21,7 +20,10 @@ void f(void) ** \.visible \.func f ** { ** \.reg\.u64 (%r[0-9]+); -** mov\.u64 \1, 3735928559; +** \.reg\.u64 (%r[0-9]+); +** stacksave\.u64 \1; ** st\.global\.u64 \[p\], \1; +** ld\.global\.u64 \2, \[p\]; +** stackrestore\.u64 \2; ** ret; */ diff --git a/gcc/testsuite/gcc.target/nvptx/alloca-1-O0.c b/gcc/testsuite/gcc.target/nvptx/alloca-1-O0.c new file mode 100644 index 000000000000..1fa8fb5873ab --- /dev/null +++ b/gcc/testsuite/gcc.target/nvptx/alloca-1-O0.c @@ -0,0 +1,49 @@ +/* { dg-do assemble } */ +/* { dg-options {-O0 -mno-soft-stack} } */ +/* { dg-add-options nvptx_alloca_ptx } */ +/* { 
dg-additional-options -save-temps } */ +/* { dg-final { check-function-bodies {** } {} } } */ + +void sink(void *); + +void f(void) +{ + sink(__builtin_alloca(123)); +} +/* +** f: +** \.visible \.func f +** { +** \.reg\.u64 (%r[0-9]+); +** \.reg\.u64 (%r[0-9]+); +** \.reg\.u64 (%r[0-9]+); +** \.reg\.u64 (%r[0-9]+); +** \.reg\.u64 (%r[0-9]+); +** \.reg\.u64 (%r[0-9]+); +** \.reg\.u64 (%r[0-9]+); +** \.reg\.u64 (%r[0-9]+); +** \.reg\.u64 (%r[0-9]+); +** \.reg\.u64 (%r[0-9]+); +** \.reg\.u64 (%r[0-9]+); +** mov\.u64 \11, 16; +** add\.u64 \2, \11, -1; +** add\.u64 \3, \2, 123; +** div\.u64 \4, \3, 16; +** mul\.lo\.u64 \5, \4, 16; +** { +** \.reg\.u64 \6_local; +** alloca\.u64 \6_local, \5; +** cvta\.local\.u64 \6, \6_local; +** } +** add\.u64 \7, \6, 15; +** shr\.u64 \8, \7, 4; +** shl\.b64 \9, \8, 4; +** mov\.u64 \1, \9; +** mov\.u64 \10, \1; +** { +** \.param\.u64 %out_arg1; +** st\.param\.u64 \[%out_arg1\], \10; +** call sink, \(%out_arg1\); +** } +** ret; +*/ diff --git a/gcc/testsuite/gcc.target/nvptx/alloca-1-O1.c b/gcc/testsuite/gcc.target/nvptx/alloca-1-O1.c new file mode 100644 index 000000000000..9ef9d4fd41ba --- /dev/null +++ b/gcc/testsuite/gcc.target/nvptx/alloca-1-O1.c @@ -0,0 +1,33 @@ +/* { dg-do assemble } */ +/* { dg-options {-O1 -mno-soft-stack} } */ +/* { dg-add-options nvptx_alloca_ptx } */ +/* { dg-additional-options -save-temps } */ +/* { dg-final { check-function-bodies {** } {} } } */ + +void sink(void *); + +void f(void) +{ + sink(__builtin_alloca(123)); +} +/* +** f: +** \.visible \.func f +** { +** \.reg\.u64 (%r[0-9]+); +** \.reg\.u64 (%r[0-9]+); +** \.reg\.u64 (%r[0-9]+); +** { +** \.reg\.u64 \1_local; +** alloca\.u64 \1_local, 128; +** cvta\.local\.u64 \1, \1_local; +** } +** add\.u64 \2, \1, 15; +** and\.b64 \3, \2, -16; +** { +** \.param\.u64 %out_arg1; +** st\.param\.u64 \[%out_arg1\], \3; +** call sink, \(%out_arg1\); +** } +** ret; +*/ diff --git a/gcc/testsuite/gcc.target/nvptx/alloca-1.c 
b/gcc/testsuite/gcc.target/nvptx/alloca-1-sm_30.c similarity index 83% rename from gcc/testsuite/gcc.target/nvptx/alloca-1.c rename to gcc/testsuite/gcc.target/nvptx/alloca-1-sm_30.c index 0aa6f107b526..261a603ec4ce 100644 --- a/gcc/testsuite/gcc.target/nvptx/alloca-1.c +++ b/gcc/testsuite/gcc.target/nvptx/alloca-1-sm_30.c @@ -1,5 +1,6 @@ /* { dg-do compile } */ /* { dg-options -mno-soft-stack } */ +/* { dg-additional-options -march=sm_30 } */ void sink(void *); diff --git a/gcc/testsuite/gcc.target/nvptx/alloca-2-O0.c b/gcc/testsuite/gcc.target/nvptx/alloca-2-O0.c new file mode 100644 index 000000000000..cadb629c74da --- /dev/null +++ b/gcc/testsuite/gcc.target/nvptx/alloca-2-O0.c @@ -0,0 +1,12 @@ +/* { dg-do link } */ +/* { dg-do run { target nvptx_runtime_alloca_ptx } } */ +/* { dg-options {-O0 -mno-soft-stack} } */ +/* { dg-add-options nvptx_alloca_ptx } */ +/* { dg-additional-options -save-temps } */ + +int +main(void) +{ + return !(__builtin_alloca(100) != __builtin_alloca(10)); +} +/* { dg-final { scan-assembler-times {(?n)\talloca\.u64\t%r[0-9]+_local, %r[0-9]+;$} 2 } } */ diff --git a/gcc/testsuite/gcc.target/nvptx/alloca-3-O1.c b/gcc/testsuite/gcc.target/nvptx/alloca-3-O1.c new file mode 100644 index 000000000000..78105760e970 --- /dev/null +++ b/gcc/testsuite/gcc.target/nvptx/alloca-3-O1.c @@ -0,0 +1,40 @@ +/* { dg-do assemble } */ +/* { dg-options {-O1 -mno-soft-stack} } */ +/* { dg-add-options nvptx_alloca_ptx } */ +/* { dg-additional-options -save-temps } */ +/* { dg-final { check-function-bodies {** } {} } } */ + +void sink(void *); + +void *p; + +void f(void) +{ + p = __builtin_stack_save(); + sink(__builtin_alloca(25)); + __builtin_stack_restore(p); +} +/* +** f: +** \.visible \.func f +** { +** \.reg\.u64 (%r[0-9]+); +** \.reg\.u64 (%r[0-9]+); +** \.reg\.u64 (%r[0-9]+); +** \.reg\.u64 (%r[0-9]+); +** stacksave\.u64 \1; +** st\.global\.u64 \[p\], \1; +** { +** \.reg\.u64 \2_local; +** alloca\.u64 \2_local, 32; +** cvta\.local\.u64 \2, \2_local; +** 
} +** add\.u64 \3, \2, 15; +** and\.b64 \4, \3, -16; +** { +** \.param\.u64 %out_arg1; +** st\.param\.u64 \[%out_arg1\], \4; +** call sink, \(%out_arg1\); +** } +** ret; +*/ diff --git a/gcc/testsuite/gcc.target/nvptx/alloca-4-O3.c b/gcc/testsuite/gcc.target/nvptx/alloca-4-O3.c new file mode 100644 index 000000000000..df1320ea2642 --- /dev/null +++ b/gcc/testsuite/gcc.target/nvptx/alloca-4-O3.c @@ -0,0 +1,55 @@ +/* { dg-do assemble } */ +/* { dg-options {-O3 -mno-soft-stack} } */ +/* { dg-add-options nvptx_alloca_ptx } */ +/* { dg-additional-options -save-temps } */ +/* { dg-final { check-function-bodies {** } {} } } */ + +void sink(void *); + +void f(void) +{ + void *p; + p = __builtin_stack_save(); + sink(__builtin_alloca(25)); + __builtin_stack_restore(p); + sink(__builtin_alloca(13)); +} +/* +** f: +** .visible .func f +** { +** \.reg\.u64 (%r[0-9]+); +** \.reg\.u64 (%r[0-9]+); +** \.reg\.u64 (%r[0-9]+); +** \.reg\.u64 (%r[0-9]+); +** \.reg\.u64 (%r[0-9]+); +** \.reg\.u64 (%r[0-9]+); +** \.reg\.u64 (%r[0-9]+); +** stacksave\.u64 \1; +** { +** \.reg\.u64 \2_local; +** alloca\.u64 \2_local, 32; +** cvta\.local\.u64 \2, \2_local; +** } +** add\.u64 \3, \2, 15; +** and\.b64 \4, \3, -16; +** { +** \.param\.u64 %out_arg1; +** st\.param\.u64 \[%out_arg1\], \4; +** call sink, \(%out_arg1\); +** } +** stackrestore\.u64 \1; +** { +** \.reg\.u64 \5_local; +** alloca\.u64 \5_local, 16; +** cvta\.local\.u64 \5, \5_local; +** } +** add\.u64 \6, \5, 15; +** and\.b64 \7, \6, -16; +** { +** \.param\.u64 %out_arg1; +** st\.param\.u64 \[%out_arg1\], \7; +** call sink, \(%out_arg1\); +** } +** ret; +*/ diff --git a/gcc/testsuite/gcc.target/nvptx/alloca-5.c b/gcc/testsuite/gcc.target/nvptx/alloca-5.c new file mode 100644 index 000000000000..ada0df0d065a --- /dev/null +++ b/gcc/testsuite/gcc.target/nvptx/alloca-5.c @@ -0,0 +1,107 @@ +/* { dg-do link } */ +/* { dg-do run { target nvptx_runtime_alloca_ptx } } */ +/* { dg-options {-O2 -mno-soft-stack} } */ +/* { dg-add-options 
nvptx_alloca_ptx } */ +/* { dg-additional-options -save-temps } */ +/* { dg-final { check-function-bodies {** } {} } } */ + +/* See also 'gcc.target/nvptx/softstack.c'. */ + +static __attribute__((noipa)) int f(int *p) +{ + return __sync_lock_test_and_set(p, 1); +} +/* +** f: +** \.func \(\.param\.u32 %value_out\) f \(\.param\.u64 %in_ar0\) +** { +** \.reg\.u32 %value; +** \.reg\.u64 %ar0; +** ld\.param\.u64 %ar0, \[%in_ar0\]; +** \.reg\.u32 (%r[0-9]+); +** \.reg\.u64 (%r[0-9]+); +** mov\.u64 \2, %ar0; +** atom\.exch\.b32 \1, \[\2\], 1; +** membar\.sys; +** mov\.u32 %value, \1; +** st\.param\.u32 \[%value_out\], %value; +** ret; +*/ + +static __attribute__((noipa)) int g(int n) +{ + /* Check that variable-length stack allocation works. */ + int v[n]; + v[0] = 0; + /* Check that atomic operations can be applied to auto data. */ + return f(v) == 0 && v[0] == 1; +} +/* +** g: +** \.func \(\.param\.u32 %value_out\) g \(\.param\.u32 %in_ar0\) +** { +** \.reg\.u32 %value; +** \.reg\.u32 %ar0; +** ld\.param\.u32 %ar0, \[%in_ar0\]; +** \.reg\.u32 (%r[0-9]+); +** \.reg\.u32 (%r[0-9]+); +** \.reg\.u64 (%r[0-9]+); +** \.reg\.u64 (%r[0-9]+); +** \.reg\.u64 (%r[0-9]+); +** \.reg\.u64 (%r[0-9]+); +** \.reg\.u64 (%r[0-9]+); +** \.reg\.u64 (%r[0-9]+); +** \.reg\.u64 (%r[0-9]+); +** \.reg\.u32 (%r[0-9]+); +** \.reg\.u32 (%r[0-9]+); +** \.reg\.pred (%r[0-9]+); +** \.reg\.u32 (%r[0-9]+); +** \.reg\.pred (%r[0-9]+); +** mov\.u32 \2, %ar0; +** cvt\.s64\.s32 \3, \2; +** shl\.b64 \4, \3, 2; +** add\.u64 \5, \4, 15; +** and\.b64 \6, \5, -16; +** { +** \.reg\.u64 \7_local; +** alloca\.u64 \7_local, \6; +** cvta\.local\.u64 \7, \7_local; +** } +** add\.u64 \8, \7, 3; +** and\.b64 \9, \8, -4; +** mov\.u32 \10, 0; +** st\.u32 \[\9\], \10; +** { +** \.param\.u32 %value_in; +** \.param\.u64 %out_arg1; +** st\.param\.u64 \[%out_arg1\], \9; +** call \(%value_in\), f, \(%out_arg1\); +** ld\.param\.u32 \11, \[%value_in\]; +** } +** setp\.ne\.u32 \12, \11, 0; +** @\12 bra (\$L[0-9]+); +** ld\.u32 
\13, \[\9\]; +** setp\.eq\.u32 \14, \13, 1; +** selp\.u32 \1, 1, 0, \14; +** bra (\$L[0-9]+); +** \15: +** mov\.u32 \1, \10; +** \16: +** mov\.u32 %value, \1; +** st\.param\.u32 \[%value_out\], %value; +** ret; +*/ + +int main() +{ + if (!g(1)) + __builtin_abort(); + return 0; +} + +/* PTX 'atom' isn't acceptable for '.local' memory: + 'operation not supported on global/shared address space' [sic] + ('CUDA_ERROR_INVALID_ADDRESS_SPACE'), thus FAILs for 'alloca'ed memory. + We'd have to use the 'nvptx_mem_local_p' replacements, but currently lack a + mechanism for doing so (TODO). + { dg-xfail-run-if TODO { *-*-* } } */ diff --git a/gcc/testsuite/gcc.target/nvptx/softstack.c b/gcc/testsuite/gcc.target/nvptx/softstack.c index 73e60f282a74..7b84a21bd4f3 100644 --- a/gcc/testsuite/gcc.target/nvptx/softstack.c +++ b/gcc/testsuite/gcc.target/nvptx/softstack.c @@ -1,6 +1,8 @@ /* { dg-options "-O2 -msoft-stack" } */ /* { dg-do run } */ +/* See also 'gcc.target/nvptx/alloca-5.c'. */ + static __attribute__((noinline,noclone)) int f(int *p) { return __sync_lock_test_and_set(p, 1); diff --git a/gcc/testsuite/gcc.target/nvptx/vla-1-O0.c b/gcc/testsuite/gcc.target/nvptx/vla-1-O0.c new file mode 100644 index 000000000000..622011f0cdd7 --- /dev/null +++ b/gcc/testsuite/gcc.target/nvptx/vla-1-O0.c @@ -0,0 +1,29 @@ +/* { dg-do assemble } */ +/* { dg-options {-O0 -mno-soft-stack} } */ +/* { dg-add-options nvptx_alloca_ptx } */ +/* { dg-additional-options -save-temps } */ +/* { dg-final { check-function-bodies {**} {} } } */ + +void sink(void *); + +void f(int s) +{ + char a[s]; + sink(a); +} +/* +** f: +** ... +** cvt\.s64\.s32 (%r[0-9]+), (%r[0-9]+); +** mov\.u64 (%r[0-9]+), 16; +** add\.u64 (%r[0-9]+), \3, -1; +** add\.u64 (%r[0-9]+), \1, \4; +** div\.u64 (%r[0-9]+), \5, 16; +** mul\.lo\.u64 (%r[0-9]+), \6, 16; +** { +** \.reg\.u64 (%r[0-9]+)_local; +** alloca\.u64 \8_local, \7; +** cvta\.local\.u64 \8, \8_local; +** } +** ... 
+*/ diff --git a/gcc/testsuite/gcc.target/nvptx/vla-1-O1.c b/gcc/testsuite/gcc.target/nvptx/vla-1-O1.c new file mode 100644 index 000000000000..7f297a619389 --- /dev/null +++ b/gcc/testsuite/gcc.target/nvptx/vla-1-O1.c @@ -0,0 +1,40 @@ +/* { dg-do assemble } */ +/* { dg-options {-O1 -mno-soft-stack} } */ +/* { dg-add-options nvptx_alloca_ptx } */ +/* { dg-additional-options -save-temps } */ +/* { dg-final { check-function-bodies {** } {} } } */ + +void sink(void *); + +void f(int s) +{ + char a[s]; + sink(a); +} +/* +** f: +** \.visible \.func f \(\.param\.u32 %in_ar0\) +** { +** \.reg\.u32 %ar0; +** ld\.param\.u32 %ar0, \[%in_ar0\]; +** \.reg\.u32 (%r[0-9]+); +** \.reg\.u64 (%r[0-9]+); +** \.reg\.u64 (%r[0-9]+); +** \.reg\.u64 (%r[0-9]+); +** \.reg\.u64 (%r[0-9]+); +** mov\.u32 \1, %ar0; +** cvt\.s64\.s32 \2, \1; +** add\.u64 \3, \2, 15; +** and\.b64 \4, \3, -16; +** { +** \.reg\.u64 \5_local; +** alloca\.u64 \5_local, \4; +** cvta\.local\.u64 \5, \5_local; +** } +** { +** \.param\.u64 %out_arg1; +** st\.param\.u64 \[%out_arg1\], \5; +** call sink, \(%out_arg1\); +** } +** ret; +*/ diff --git a/gcc/testsuite/gcc.target/nvptx/vla-1.c b/gcc/testsuite/gcc.target/nvptx/vla-1-sm_30.c similarity index 83% rename from gcc/testsuite/gcc.target/nvptx/vla-1.c rename to gcc/testsuite/gcc.target/nvptx/vla-1-sm_30.c index 5baf95cecfcc..2bf2c91d60e9 100644 --- a/gcc/testsuite/gcc.target/nvptx/vla-1.c +++ b/gcc/testsuite/gcc.target/nvptx/vla-1-sm_30.c @@ -1,5 +1,6 @@ /* { dg-do compile } */ /* { dg-options -mno-soft-stack } */ +/* { dg-additional-options -march=sm_30 } */ void sink(void *); diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp index d7d7217be058..a89f531f8876 100644 --- a/gcc/testsuite/lib/target-supports.exp +++ b/gcc/testsuite/lib/target-supports.exp @@ -1009,9 +1009,37 @@ proc check_effective_target_alloca {} { return 0 } if { [istarget nvptx-*-*] } { + # For nvptx, 'alloca' support depends on the configuration. 
In case + # of PTX "native" stacks, for 'dg-do run', it additionally depends on + # runtime support. if { ![check_effective_target_nvptx_softstack] } { - return 0 + # '-mno-soft-stack': PTX "native" stacks + + # Not supported unless '-mptx=7.3'+ and '-march=sm_52'+. + if { !([check_nvptx_default_ptx_isa_version_at_least 7 3] + && [check_nvptx_default_ptx_isa_target_architecture_at_least sm_52]) } { + return 0 + } + + # Find 'dg-do-what' in an outer frame. + set level 1 + while true { + upvar $level dg-do-what dg-do-what + if [info exists dg-do-what] then break + incr level + } + verbose "check_effective_target_alloca: found dg-do-what at level $level" 2 + + if { [string equal [lindex ${dg-do-what} 0] run] } { + # For 'dg-do run', it additionally depends on runtime support. + # (If not supported, we don't try to demote 'run' to 'link', + # but instead simply fail the effective-target 'alloca' check.) + return [check_effective_target_nvptx_runtime_alloca_ptx] + } else { + return 1 + } } else { + # '-msoft-stack' return 1 } } @@ -14100,6 +14128,35 @@ proc check_effective_target_nvptx_default_ptx_isa_version_at_least_6_0 { } { return [check_nvptx_default_ptx_isa_version_at_least 6 0] } +# Return 1 if nvptx code by default compiles for at least the specified PTX ISA +# target architecture. + +proc check_nvptx_default_ptx_isa_target_architecture_at_least { ta } { + set name nvptx_default_ptx_isa_target_architecture_at_least_${ta} + + if [regexp {^sm_(\d+)$} $ta dummy ptx_sm] { + set ptx_sm "${ptx_sm}0" + } else { + error "check_nvptx_default_ptx_isa_target_architecture_at_least: illegal argument: $ta" + } + + set supported_p \ + [concat \ + "(__PTX_SM__ >= $ptx_sm)"] + + set src \ + [list \ + "#if $supported_p" \ + "#else" \ + "#error unsupported" \ + "#endif"] + set src [join $src "\n"] + + set res [check_no_compiler_messages $name assembly $src ""] + + return $res +} + # Return 1 if nvptx '-msoft-stack' is enabled. 
proc check_effective_target_nvptx_softstack { } { @@ -14132,6 +14189,28 @@ proc check_nvptx_runtime_ptx_isa_version_at_least { major minor } { return $res } +# Return 1 if nvptx code with the specified PTX ISA target architecture or +# higher can be run. + +proc check_nvptx_runtime_ptx_isa_target_architecture_at_least { ta } { + set name nvptx_runtime_ptx_isa_target_architecture_${ta} + + set default \ + [check_nvptx_default_ptx_isa_target_architecture_at_least ${ta}] + + if { $default } { + set flag "" + } else { + set flag "-march=$ta -mptx=_" + } + + set res [check_runtime $name { + int main (void) { return 0; } + } $flag] + + return $res +} + # Return 1 if the nvptx runtime environment supports the PTX ISA directive # '.alias'. @@ -14139,6 +14218,13 @@ proc check_effective_target_nvptx_runtime_alias_ptx { } { return [check_nvptx_runtime_ptx_isa_version_at_least 6 3] } +# Return 1 if the nvptx runtime environment supports PTX 'alloca'. + +proc check_effective_target_nvptx_runtime_alloca_ptx { } { + return [expr { [check_nvptx_runtime_ptx_isa_version_at_least 7 3] + && [check_nvptx_runtime_ptx_isa_target_architecture_at_least sm_52] }] +} + # Add options to enable nvptx using the PTX ISA directive '.alias'. proc add_options_for_nvptx_alias_ptx { flags } { @@ -14150,3 +14236,20 @@ proc add_options_for_nvptx_alias_ptx { flags } { return $flags } + +# Add options to enable nvptx using PTX 'alloca'. + +proc add_options_for_nvptx_alloca_ptx { flags } { + # We don't add '-mno-soft-stack' here; the users should take care of that + # explicitly. 
+ + if { ![check_nvptx_default_ptx_isa_version_at_least 7 3] } { + append flags " -mptx=7.3" + } + + if { ![check_nvptx_default_ptx_isa_target_architecture_at_least sm_52] } { + append flags " -march=sm_52" + } + + return $flags +} diff --git a/libgomp/fortran.c b/libgomp/fortran.c index 087cb49945ca..7a0386e5ccb3 100644 --- a/libgomp/fortran.c +++ b/libgomp/fortran.c @@ -846,8 +846,8 @@ omp_get_device_from_uid_ (const char *uid, size_t uid_len) /* Inside the target region, invoking this routine is undefined behavior; thus, resolve it already here - instead of inside libgomp/config/.../target.c. - Note that on nvptx __builtin_alloca is defined, but fails with a sorry - during compilation, as it is unsupported until isa 7.3 / sm_52. */ + This also circumvents issues due to not all nvptx configurations + supporting 'alloca'. */ return omp_invalid_device; #endif } diff --git a/libgomp/testsuite/libgomp.oacc-fortran/privatized-ref-2.f90 b/libgomp/testsuite/libgomp.oacc-fortran/privatized-ref-2.f90 index 8cf79a10e8d2..14e8f99d391e 100644 --- a/libgomp/testsuite/libgomp.oacc-fortran/privatized-ref-2.f90 +++ b/libgomp/testsuite/libgomp.oacc-fortran/privatized-ref-2.f90 @@ -1,12 +1,5 @@ ! { dg-do run } -! PR65181 "Support for alloca in nvptx" -! { dg-excess-errors "lto1, mkoffload and lto-wrapper fatal errors" { target openacc_nvidia_accel_selected } } -! Aside from restricting this testcase to non-nvptx offloading, and duplicating -! it with 'dg-do link' for nvptx offloading, there doesn't seem to be a way to -! XFAIL the "UNRESOLVED: [...] compilation failed to produce executable", or -! get rid of it, unfortunately. - ! { dg-additional-options "-fopt-info-note-omp" } ! { dg-additional-options "--param=openacc-privatization=noisy" } ! { dg-additional-options "-foffload=-fopt-info-note-omp" } @@ -59,7 +52,6 @@ contains ! { dg-note {variable 'array' in 'private' clause is candidate for adjusting OpenACC privatization level} "" { target *-*-* } l_loop$c_loop } ! 
{ dg-note {variable 'array' ought to be adjusted for OpenACC privatization level: 'gang'} "" { target *-*-* } l_loop$c_loop } ! { dg-note {variable 'array' adjusted for OpenACC privatization level: 'gang'} "" { target { ! { openacc_host_selected || { openacc_nvidia_accel_selected && __OPTIMIZE__ } } } } l_loop$c_loop } - ! { dg-message {sorry, unimplemented: target cannot support alloca} PR65181 { target openacc_nvidia_accel_selected } l_loop$c_loop } do i = 1, 10 array(i) = i end do @@ -91,7 +83,6 @@ contains ! { dg-note {variable 'array\.[0-9]+' in 'private' clause is candidate for adjusting OpenACC privatization level} "" { target *-*-* } l_loop$c_loop } ! { dg-note {variable 'array\.[0-9]+' ought to be adjusted for OpenACC privatization level: 'gang'} "" { target *-*-* } l_loop$c_loop } ! { dg-note {variable 'array\.[0-9]+' adjusted for OpenACC privatization level: 'gang'} "" { target { ! { openacc_host_selected || { openacc_nvidia_accel_selected && __OPTIMIZE__ } } } } l_loop$c_loop } - ! { dg-message {sorry, unimplemented: target cannot support alloca} PR65181 { target openacc_nvidia_accel_selected } l_loop$c_loop } do i = 1, 10 array(i) = 9*i end do @@ -117,7 +108,6 @@ contains ! { dg-note {variable 'str' ought to be adjusted for OpenACC privatization level: 'gang'} "" { target *-*-* } l_loop$c_loop } ! { dg-note {variable 'str' adjusted for OpenACC privatization level: 'gang'} "" { target { ! { openacc_host_selected || { openacc_nvidia_accel_selected && __OPTIMIZE__ } } } } l_loop$c_loop } ! { dg-note {variable 'char\.[0-9]+' declared in block isn't candidate for adjusting OpenACC privatization level: artificial} "" { target *-*-* } l_loop$c_loop } - ! { dg-message {sorry, unimplemented: target cannot support alloca} PR65181 { target openacc_nvidia_accel_selected } l_loop$c_loop } do i = 1, 10 str(i:i) = achar(ichar('A') + i) end do -- 2.34.1