This patch teaches the nvptx vector state propagator how to handle QImode and HImode variables. It zero-extends the 8- and 16-bit values to 32 bits (SImode) so that the shuffle broadcast can be used to propagate the register, then truncates the result back to the original mode.
I'm not sure whether my solution is the best way to resolve this problem. It looks like the nvptx backend frequently assigns larger .u16 and .u32 registers to chars and shorts, which masks this problem at -O0. Because a lot of the registers are already u32, the conversion to and from u8 and u16 seems like an unnecessary step when the nvptx backend should be able to broadcast the original u32 register directly. Is there a better way to resolve this issue, or is this patch OK for trunk as-is? Cesar
2016-02-22 Cesar Philippidis <ce...@codesourcery.com> gcc/ * config/nvptx/nvptx.c (nvptx_gen_shuffle): Add support for QImode and HImode register. libgomp/ * testsuite/libgomp.oacc-c-c++-common/vprop.c: New test. diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c index 3faacd5..728cb00 100644 --- a/gcc/config/nvptx/nvptx.c +++ b/gcc/config/nvptx/nvptx.c @@ -1306,6 +1306,20 @@ nvptx_gen_shuffle (rtx dst, rtx src, rtx idx, nvptx_shuffle_kind kind) end_sequence (); } break; + case QImode: + case HImode: + { + rtx tmp = gen_reg_rtx (SImode); + + start_sequence (); + emit_insn (gen_rtx_SET (tmp, gen_rtx_fmt_e (ZERO_EXTEND, SImode, src))); + emit_insn (nvptx_gen_shuffle (tmp, tmp, idx, kind)); + emit_insn (gen_rtx_SET (dst, gen_rtx_fmt_e (TRUNCATE, GET_MODE (dst), + tmp))); + res = get_insns (); + end_sequence (); + } + break; default: gcc_unreachable (); diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/vprop.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/vprop.c new file mode 100644 index 0000000..a9b63dc --- /dev/null +++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/vprop.c @@ -0,0 +1,34 @@ +#include <assert.h> + +#define test(type) \ +void \ +test_##type () \ +{ \ + type b[100]; \ + type i, j, x = -1, y = -1; \ + \ + _Pragma("acc parallel loop copyout (b)") \ + for (j = 0; j > -5; j--) \ + { \ + type c = x+y; \ + _Pragma("acc loop vector") \ + for (i = 0; i < 20; i++) \ + b[-j*20 + i] = c; \ + b[5-j] = c; \ + } \ + \ + for (i = 0; i < 100; i++) \ + assert (b[i] == -2); \ +} + +test(char) +test(short) + +int +main () +{ + test_char (); + test_short (); + + return 0; +}