https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96428

--- Comment #4 from Tom de Vries <vries at gcc dot gnu.org> ---
For the record, this is not the leanest solution.

This patch generates:
...
                cvt.u64.u64     %r74, %r65.x;
                cvt.u64.u64     %r75, %r65.y;
                mov.b64 {%r76,%r77}, %r74;
                shfl.idx.b32    %r76, %r76, 0, 31;
                shfl.idx.b32    %r77, %r77, 0, 31;
                mov.b64 %r74, {%r76,%r77};
                mov.b64 {%r78,%r79}, %r75;
                shfl.idx.b32    %r78, %r78, 0, 31;
                shfl.idx.b32    %r79, %r79, 0, 31;
                mov.b64 %r75, {%r78,%r79};
                cvt.u64.u64     %r65.x, %r74;
                cvt.u64.u64     %r65.y, %r75;
...

But with this follow-up patch applied on top:
...
diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c
index cf53a921e5b..84df8e1ca4a 100644
--- a/gcc/config/nvptx/nvptx.c
+++ b/gcc/config/nvptx/nvptx.c
@@ -1821,15 +1821,9 @@ nvptx_gen_shuffle (rtx dst, rtx src, rtx idx, nvptx_shuffle_kind kind)
        rtx src1 = gen_rtx_SUBREG (DImode, src, 8);
        rtx dst0 = gen_rtx_SUBREG (DImode, dst, 0);
        rtx dst1 = gen_rtx_SUBREG (DImode, dst, 8);
-       rtx tmp0 = gen_reg_rtx (DImode);
-       rtx tmp1 = gen_reg_rtx (DImode);
        start_sequence ();
-       emit_insn (gen_movdi (tmp0, src0));
-       emit_insn (gen_movdi (tmp1, src1));
-       emit_insn (nvptx_gen_shuffle (tmp0, tmp0, idx, kind));
-       emit_insn (nvptx_gen_shuffle (tmp1, tmp1, idx, kind));
-       emit_insn (gen_movdi (dst0, tmp0));
-       emit_insn (gen_movdi (dst1, tmp1));
+       emit_insn (nvptx_gen_shuffle (dst0, src0, idx, kind));
+       emit_insn (nvptx_gen_shuffle (dst1, src1, idx, kind));
        res = get_insns ();
        end_sequence ();
       }
diff --git a/gcc/config/nvptx/nvptx.md b/gcc/config/nvptx/nvptx.md
index c23edcf34bf..6e81ad449b3 100644
--- a/gcc/config/nvptx/nvptx.md
+++ b/gcc/config/nvptx/nvptx.md
@@ -176,6 +176,11 @@
   "A pseudo register."
   (match_code "reg"))

+(define_constraint "Q"
+  "A pseudo register."
+  (ior (match_code "reg")
+       (match_code "subreg")))
+
 (define_constraint "Ia"
   "Any integer constant."
   (and (match_code "const_int") (match_test "true")))
@@ -1513,21 +1518,23 @@
 ;; extract parts of a 64 bit object into 2 32-bit ints
 (define_insn "unpack<mode>si2"
   [(set (match_operand:SI 0 "nvptx_register_operand" "=R")
-        (unspec:SI [(match_operand:BITD 2 "nvptx_register_operand" "R")
+        (unspec:SI [(match_operand:BITD 2 "register_operand" "Q")
                    (const_int 0)] UNSPEC_BIT_CONV))
    (set (match_operand:SI 1 "nvptx_register_operand" "=R")
         (unspec:SI [(match_dup 2) (const_int 1)] UNSPEC_BIT_CONV))]
   ""
-  "%.\\tmov.b64\\t{%0,%1}, %2;")
+  "%.\\tmov.b64\\t{%0,%1}, %2;"
+  [(set_attr "subregs_ok" "true")])

 ;; pack 2 32-bit ints into a 64 bit object
 (define_insn "packsi<mode>2"
-  [(set (match_operand:BITD 0 "nvptx_register_operand" "=R")
+  [(set (match_operand:BITD 0 "register_operand" "=Q")
         (unspec:BITD [(match_operand:SI 1 "nvptx_register_operand" "R")
                      (match_operand:SI 2 "nvptx_register_operand" "R")]
                    UNSPEC_BIT_CONV))]
   ""
-  "%.\\tmov.b64\\t%0, {%1,%2};")
+  "%.\\tmov.b64\\t%0, {%1,%2};"
+  [(set_attr "subregs_ok" "true")])

 ;; Atomic insns.

...

we have instead:
...
                mov.b64 {%r74,%r75}, %r65.x;
                shfl.idx.b32    %r74, %r74, 0, 31;            
                shfl.idx.b32    %r75, %r75, 0, 31;
                mov.b64 %r65.x, {%r74,%r75};
...

But for an ICE fix, I'd rather keep things simple.

Reply via email to