Recent testing uncovered that the PTX JIT may reject 'mul.u32' when it is used as a non-widening 32-bit multiply instruction. Use 'mul.lo.u32' instead, which fixes 32-bit code generation and conforms better to the PTX specification.
* config/nvptx/nvptx.c (nvptx_init_unisimt_predicate): Emit 'mul.lo.u32' instead of 'mul.u32' for 32-bit ABI target. (nvptx_declare_function_name): Ditto. --- gcc/ChangeLog.gomp-nvptx | 6 ++++++ gcc/config/nvptx/nvptx.c | 4 ++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c index 93bf781..bc187ea 100644 --- a/gcc/config/nvptx/nvptx.c +++ b/gcc/config/nvptx/nvptx.c @@ -955,7 +955,7 @@ nvptx_init_unisimt_predicate (FILE *file) fprintf (file, "\t\t.reg.u%d %%ustmp2;\n", bits); fprintf (file, "\t\tmov.u32 %%ustmp0, %%tid.y;\n"); fprintf (file, "\t\tmul%s.u32 %%ustmp1, %%ustmp0, 4;\n", - bits == 64 ? ".wide" : ""); + bits == 64 ? ".wide" : ".lo"); fprintf (file, "\t\tmov.u%d %%ustmp2, __nvptx_uni;\n", bits); fprintf (file, "\t\tadd.u%d %%ustmp2, %%ustmp2, %%ustmp1;\n", bits); fprintf (file, "\t\tld.shared.u32 %%r%d, [%%ustmp2];\n", master); @@ -1115,7 +1115,7 @@ nvptx_declare_function_name (FILE *file, const char *name, const_tree decl) fprintf (file, "\t.reg.u%d %%fstmp2;\n", bits); fprintf (file, "\tmov.u32 %%fstmp0, %%tid.y;\n"); fprintf (file, "\tmul%s.u32 %%fstmp1, %%fstmp0, %d;\n", - bits == 64 ? ".wide" : "", bits / 8); + bits == 64 ? ".wide" : ".lo", bits / 8); fprintf (file, "\tmov.u%d %%fstmp2, __nvptx_stacks;\n", bits); /* fstmp2 = &__nvptx_stacks[tid.y]; */ fprintf (file, "\tadd.u%d %%fstmp2, %%fstmp2, %%fstmp1;\n", bits);