On 01/18/2018 02:27 PM, Tom de Vries wrote:
On 01/18/2018 12:40 AM, Cesar Philippidis wrote:
In PR83920, I encountered a nvptx bug where live predicate variables
were clobbered before their value was broadcasted.

Hi,

I've managed to reproduce the problem based on the description in the PR.

I think the way to address it is using a tmp .pred reg like so:
...
{
   .reg .u32 %x;
   mov.u32 %x,%tid.x;
   setp.ne.u32 %rnotvzero,%x,0;
}

{
   .reg .pred %rcond2;
   setp.eq.u32 %rcond2, 1, 0; // workaround

   @%rnotvzero bra Lskip;
   ...
   setp.<op>.<type> %rcond,op1,op2; // could be here, could be earlier
   mov.b1 %rcond2, %rcond; // used pseudo opcode mov.b1 for convenience
  Lskip:
   selp.u32 %rcondu32,1,0,%rcond2;
   shfl.idx.b32 %rcondu32,%rcondu32,0,31;
   setp.ne.u32 %rcond,%rcondu32,0;
}
...


Hi,

this is the fix that I plan to commit (similar to the scheme listed above, but modified to keep the selp.u32 using rcond, which is easier in code generation).

Build and reg-tested on x86_64 with nvptx accelerator.

Richard, this is an 8 regression for the nvptx target. OK for stage 4 or defer to stage1?

Thanks,
- Tom
[nvptx] Fix bug in jit bug workaround

2018-01-19  Tom de Vries  <t...@codesourcery.com>
	    Cesar Philippidis  <ce...@codesourcery.com>

	PR target/83920

	* config/nvptx/nvptx.c (nvptx_single): Fix jit workaround.

	* testsuite/libgomp.oacc-c-c++-common/pr83920.c: New test.
	* testsuite/libgomp.oacc-fortran/pr83920.f90: New test.

---
 gcc/config/nvptx/nvptx.c                           | 28 +++++++++++++++++--
 .../testsuite/libgomp.oacc-c-c++-common/pr83920.c  | 32 ++++++++++++++++++++++
 libgomp/testsuite/libgomp.oacc-fortran/pr83920.f90 | 28 +++++++++++++++++++
 3 files changed, 86 insertions(+), 2 deletions(-)

diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c
index 86fc13f4fc0..afb0e4dd185 100644
--- a/gcc/config/nvptx/nvptx.c
+++ b/gcc/config/nvptx/nvptx.c
@@ -4096,9 +4096,33 @@ nvptx_single (unsigned mask, basic_block from, basic_block to)
 
 	     There is nothing in the PTX spec to suggest that this is wrong, or
 	     to explain why the extra initialization is needed.  So, we classify
-	     it as a JIT bug, and the extra initialization as workaround.  */
-	  emit_insn_before (gen_movbi (pvar, const0_rtx),
+	     it as a JIT bug, and the extra initialization as workaround:
+
+		{
+		    .reg .u32 %x;
+		    mov.u32 %x,%tid.x;
+		    setp.ne.u32 %rnotvzero,%x,0;
+		}
+
+		+.reg .pred %rcond2;
+		+setp.eq.u32 %rcond2, 1, 0;
+
+		 @%rnotvzero bra Lskip;
+		 setp.<op>.<type> %rcond,op1,op2;
+		+mov.pred %rcond2, %rcond;
+		 Lskip:
+		+mov.pred %rcond, %rcond2;
+		 selp.u32 %rcondu32,1,0,%rcond;
+		 shfl.idx.b32 %rcondu32,%rcondu32,0,31;
+		 setp.ne.u32 %rcond,%rcondu32,0;
+	  */
+	  rtx_insn *label = PREV_INSN (tail);
+	  gcc_assert (label && LABEL_P (label));
+	  rtx tmp = gen_reg_rtx (BImode);
+	  emit_insn_before (gen_movbi (tmp, const0_rtx),
 			    bb_first_real_insn (from));
+	  emit_insn_before (gen_rtx_SET (tmp, pvar), label);
+	  emit_insn_before (gen_rtx_SET (pvar, tmp), tail);
 #endif
 	  emit_insn_before (nvptx_gen_vcast (pvar), tail);
 	}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/pr83920.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/pr83920.c
new file mode 100644
index 00000000000..6cd3b5d6f06
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/pr83920.c
@@ -0,0 +1,32 @@
+/* { dg-do run } */
+
+#include <stdlib.h>
+
+#define n 10
+
+static void __attribute__((noinline)) __attribute__((noclone))
+foo (int beta, int *c)
+{
+  #pragma acc parallel copy(c[0:(n * n) - 1]) num_gangs(2)
+  #pragma acc loop gang
+  for (int j = 0; j < n; ++j)
+    if (beta != 1)
+      {
+        #pragma acc loop vector
+	for (int i = 0; i < n; ++i)
+	  c[i + (j * n)] = 0;
+      }
+}
+
+int
+main (void)
+{
+  int c[n * n];
+
+  c[0] = 1;
+  foo (0, c);
+  if (c[0] != 0)
+    abort ();
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/pr83920.f90 b/libgomp/testsuite/libgomp.oacc-fortran/pr83920.f90
new file mode 100644
index 00000000000..34ad001abcd
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-fortran/pr83920.f90
@@ -0,0 +1,28 @@
+! { dg-do run }
+
+subroutine foo (BETA, C)
+  real ::  C(100,100)
+  integer :: i, j, l
+  real, parameter :: one = 1.0
+  real :: beta
+
+  !$acc parallel copy(c(1:100,1:100)) num_gangs(2)
+  !$acc loop gang
+  do j = 1, 100
+     if (beta /= one) then
+        !$acc loop vector
+        do i = 1, 100
+           C(i,j) = 0.0
+        end do
+     end if
+  end do
+  !$acc end parallel
+end subroutine foo
+
+program test_foo
+  real :: c(100,100), beta
+  beta = 0.0
+  c(:,:) = 1.0
+  call foo (beta, c)
+  if (c(1,1) /= 0.0) call abort ()
+end program test_foo

Reply via email to