https://gcc.gnu.org/bugzilla/show_bug.cgi?id=81069
--- Comment #3 from Tom de Vries <vries at gcc dot gnu.org> ---
The immediate reason for the failure is a bug in the PTX JIT compiler.
This piece of PTX code (containing two functionally equivalent but differently
laid out variants, DETOUR=0 and DETOUR=1) is a minimal example, and is
translated correctly for DETOUR=0 and incorrectly for DETOUR=1:
...
// Module directives: PTX ISA version 3.1, target sm_30, 64-bit addresses.
.version 3.1
.target sm_30
.address_size 64
// Forward declaration of the kernel entry point defined right below.
.entry test_function (.param .u64 %in_ar0);
// test_function: minimal reproducer with two layouts selected by DETOUR.
//   DETOUR == 0: the lane-0 block ($L3/$L20) sits inline, before $L8.
//   DETOUR == 1: the same block sits at the end of the body, after the trap.
// Both layouts have the same control flow; only the textual placement of the
// $L3/$L20 block differs. Any other DETOUR value leaves $L3 and $L20
// undefined, because neither #if block is included.
// NOTE(review): the #if/#endif lines presumably have to be resolved by a C
// preprocessor before the text is handed to the PTX compiler -- confirm.
// The parameter %in_ar0 is declared but never read in the body.
// Intended outcome: every lane sees %ra == 1 after the shuffle and returns;
// a lane that sees any other value branches to the trap at $L6.
.entry test_function (.param .u64 %in_ar0)
{
// Init %ra to 0 in all lanes.
.reg .u32 %ra;
mov.u32 %ra,0;
// Branch around if not lane 0.
// %l and %lane0 are declared inside this nested brace block.
{
.reg .u32 %l;
mov.u32 %l,%laneid;
.reg .pred %lane0;
setp.eq.u32 %lane0,%l,0;
// Lanes other than 0 skip the write to %ra and go straight to $L20.
@ ! %lane0 bra $L20;
}
// Lane 0 goes to $L3: the next label when DETOUR == 0, the block at the end
// of the body when DETOUR == 1.
bra $L3;
#if DETOUR == 0
// Inline placement of the lane-0 block.
$L3:
// Set %ra to 1 in lane zero
mov.u32 %ra,1;
// Both paths (lane 0 and the other lanes) meet again here.
$L20:
bra $L8;
#endif
$L8:
// Broadcast %ra from lane 0 to all lanes
// (source %ra, lane operand 0, third operand 31; destination overwrites %ra).
shfl.idx.b32 %ra,%ra,0,31;
// Branch to trap if %ra not 1 in all lanes
{
.reg .pred %ra_is_one;
setp.eq.u32 %ra_is_one,%ra,1;
@ ! %ra_is_one bra $L6;
}
// Success path: %ra == 1 in this lane.
ret;
// Failure path: this lane did not receive 1 from the shuffle.
$L6:
trap;
#if DETOUR == 1
// Out-of-line placement of the same lane-0 block, after ret/trap.
$L3:
// Set %ra to 1 in lane zero
mov.u32 %ra,1;
// Both paths (lane 0 and the other lanes) meet again here, then jump back
// up to the shuffle at $L8.
$L20:
bra $L8;
#endif
}
...
The problem in the DETOUR=1 case is that, although the divergent branch
'@ ! %lane0 bra $L20' is post-dominated by $L20, the synchronization point is
inserted much later, after the shfl. This means the shfl is executed in a
divergent state, which invokes undefined behaviour: %ra ends up with an
undefined value in one of the threads in the warp, and that thread makes the
example call the trap.