https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96428
--- Comment #4 from Tom de Vries <vries at gcc dot gnu.org> --- FTR, this is not the leanest solution. This patch generates: ... cvt.u64.u64 %r74, %r65.x; cvt.u64.u64 %r75, %r65.y; mov.b64 {%r76,%r77}, %r74; shfl.idx.b32 %r76, %r76, 0, 31; shfl.idx.b32 %r77, %r77, 0, 31; mov.b64 %r74, {%r76,%r77}; mov.b64 {%r78,%r79}, %r75; shfl.idx.b32 %r78, %r78, 0, 31; shfl.idx.b32 %r79, %r79, 0, 31; mov.b64 %r75, {%r78,%r79}; cvt.u64.u64 %r65.x, %r74; cvt.u64.u64 %r65.y, %r75; ... but using this followup patch: ... diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c index cf53a921e5b..84df8e1ca4a 100644 --- a/gcc/config/nvptx/nvptx.c +++ b/gcc/config/nvptx/nvptx.c @@ -1821,15 +1821,9 @@ nvptx_gen_shuffle (rtx dst, rtx src, rtx idx, nvptx_shuffle_kind kind) rtx src1 = gen_rtx_SUBREG (DImode, src, 8); rtx dst0 = gen_rtx_SUBREG (DImode, dst, 0); rtx dst1 = gen_rtx_SUBREG (DImode, dst, 8); - rtx tmp0 = gen_reg_rtx (DImode); - rtx tmp1 = gen_reg_rtx (DImode); start_sequence (); - emit_insn (gen_movdi (tmp0, src0)); - emit_insn (gen_movdi (tmp1, src1)); - emit_insn (nvptx_gen_shuffle (tmp0, tmp0, idx, kind)); - emit_insn (nvptx_gen_shuffle (tmp1, tmp1, idx, kind)); - emit_insn (gen_movdi (dst0, tmp0)); - emit_insn (gen_movdi (dst1, tmp1)); + emit_insn (nvptx_gen_shuffle (dst0, src0, idx, kind)); + emit_insn (nvptx_gen_shuffle (dst1, src1, idx, kind)); res = get_insns (); end_sequence (); } diff --git a/gcc/config/nvptx/nvptx.md b/gcc/config/nvptx/nvptx.md index c23edcf34bf..6e81ad449b3 100644 --- a/gcc/config/nvptx/nvptx.md +++ b/gcc/config/nvptx/nvptx.md @@ -176,6 +176,11 @@ "A pseudo register." (match_code "reg")) +(define_constraint "Q" + "A pseudo register." + (ior (match_code "reg") + (match_code "subreg"))) + (define_constraint "Ia" "Any integer constant." 
 (and (match_code "const_int") (match_test "true"))) @@ -1513,21 +1518,23 @@ ;; extract parts of a 64 bit object into 2 32-bit ints (define_insn "unpack<mode>si2" [(set (match_operand:SI 0 "nvptx_register_operand" "=R") - (unspec:SI [(match_operand:BITD 2 "nvptx_register_operand" "R") + (unspec:SI [(match_operand:BITD 2 "register_operand" "Q") (const_int 0)] UNSPEC_BIT_CONV)) (set (match_operand:SI 1 "nvptx_register_operand" "=R") (unspec:SI [(match_dup 2) (const_int 1)] UNSPEC_BIT_CONV))] "" - "%.\\tmov.b64\\t{%0,%1}, %2;") + "%.\\tmov.b64\\t{%0,%1}, %2;" + [(set_attr "subregs_ok" "true")]) ;; pack 2 32-bit ints into a 64 bit object (define_insn "packsi<mode>2" - [(set (match_operand:BITD 0 "nvptx_register_operand" "=R") + [(set (match_operand:BITD 0 "register_operand" "=Q") (unspec:BITD [(match_operand:SI 1 "nvptx_register_operand" "R") (match_operand:SI 2 "nvptx_register_operand" "R")] UNSPEC_BIT_CONV))] "" - "%.\\tmov.b64\\t%0, {%1,%2};") + "%.\\tmov.b64\\t%0, {%1,%2};" + [(set_attr "subregs_ok" "true")]) ;; Atomic insns. ... we have instead: ... mov.b64 {%r74,%r75}, %r65.x; shfl.idx.b32 %r74, %r74, 0, 31; shfl.idx.b32 %r75, %r75, 0, 31; mov.b64 %r65.x, {%r74,%r75}; ... But since this is a fix for an ICE (internal compiler error), I'd rather keep the fix itself simple.