https://github.com/arsenm created https://github.com/llvm/llvm-project/pull/117822
For multipass instructions, an overlap between VDST and the SRC operands would result in a hardware race and undefined results. Co-authored-by: Pravin Jagtap <pravin.jag...@amd.com> From 418ffe2eedd620829043e7585371464ec2dd44c7 Mon Sep 17 00:00:00 2001 From: Pravin Jagtap <pravin.jag...@amd.com> Date: Thu, 11 Jul 2024 05:12:42 -0400 Subject: [PATCH] AMDGPU: Allocate different registers for vdst & src in v_cvt_scalef32* MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For multipass instructions, an overlap between VDST and the SRC operands would result in a hardware race and undefined results. Co-authored-by: Pravin Jagtap <pravin.jag...@amd.com> --- llvm/lib/Target/AMDGPU/VOP3Instructions.td | 12 +- .../llvm.amdgcn.cvt.scalef32.pk.gfx950.ll | 214 +++++++++++++----- .../AMDGPU/llvm.amdgcn.cvt.scalef32.pk.ll | 168 +++++++------- .../llvm.amdgcn.cvt.scalef32.sr.pk.gfx950.ll | 48 ++-- .../AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk.ll | 168 +++++++------- 5 files changed, 353 insertions(+), 257 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td index 00caea1f923391..9ef52c0feb7233 100644 --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -1088,9 +1088,11 @@ let SubtargetPredicate = HasFP4ConversionScaleInsts, mayRaiseFPException = 0 in defm V_CVT_SCALEF32_PK_F32_FP4 : VOP3Inst<"v_cvt_scalef32_pk_f32_fp4", VOP3_CVT_SCALE_PK_F16BF16F32_FP4FP8BF8_Profile<v2f32>>; let Constraints = "$vdst = $vdst_in", DisableEncoding="$vdst_in" in { defm V_CVT_SCALEF32_PK_FP4_F32 : VOP3Inst<"v_cvt_scalef32_pk_fp4_f32", VOP3_CVT_SCALE_FP4FP8BF8_F32_TiedInput_Profile<VOP_I32_F32_F32_F32>>; - defm V_CVT_SCALEF32_SR_PK_FP4_F16: VOP3Inst<"v_cvt_scalef32_sr_pk_fp4_f16", VOP3_CVT_SCALE_SR_PK_F4_F16BF16_TiedInput_Profile<v2f16>>; - defm V_CVT_SCALEF32_SR_PK_FP4_BF16: VOP3Inst<"v_cvt_scalef32_sr_pk_fp4_bf16", VOP3_CVT_SCALE_SR_PK_F4_F16BF16_TiedInput_Profile<v2bf16>>; - defm V_CVT_SCALEF32_SR_PK_FP4_F32: 
VOP3Inst<"v_cvt_scalef32_sr_pk_fp4_f32", VOP3_CVT_SCALE_SR_PK_F4_F32_TiedInput_Profile>; + let Constraints = "@earlyclobber $vdst" in { + defm V_CVT_SCALEF32_SR_PK_FP4_F16: VOP3Inst<"v_cvt_scalef32_sr_pk_fp4_f16", VOP3_CVT_SCALE_SR_PK_F4_F16BF16_TiedInput_Profile<v2f16>>; + defm V_CVT_SCALEF32_SR_PK_FP4_BF16: VOP3Inst<"v_cvt_scalef32_sr_pk_fp4_bf16", VOP3_CVT_SCALE_SR_PK_F4_F16BF16_TiedInput_Profile<v2bf16>>; + defm V_CVT_SCALEF32_SR_PK_FP4_F32: VOP3Inst<"v_cvt_scalef32_sr_pk_fp4_f32", VOP3_CVT_SCALE_SR_PK_F4_F32_TiedInput_Profile>; + } } defm V_CVT_SCALEF32_PK_F16_FP4 : VOP3Inst<"v_cvt_scalef32_pk_f16_fp4", VOP3_CVT_SCALE_PK_F16BF16F32_FP4FP8BF8_Profile<v2f16>>; defm V_CVT_SCALEF32_PK_BF16_FP4 : VOP3Inst<"v_cvt_scalef32_pk_bf16_fp4", VOP3_CVT_SCALE_PK_F16BF16F32_FP4FP8BF8_Profile<v2bf16>>; @@ -1103,7 +1105,7 @@ let SubtargetPredicate = HasFP4ConversionScaleInsts, mayRaiseFPException = 0 in } } -let SubtargetPredicate = HasFP6BF6ConversionScaleInsts, mayRaiseFPException = 0 in { +let SubtargetPredicate = HasFP6BF6ConversionScaleInsts, mayRaiseFPException = 0, Constraints = "@earlyclobber $vdst" in { defm V_CVT_SCALEF32_PK32_F32_FP6 : VOP3Inst<"v_cvt_scalef32_pk32_f32_fp6", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V32F32_V6I32_F32>, int_amdgcn_cvt_scalef32_pk32_f32_fp6>; defm V_CVT_SCALEF32_PK32_F32_BF6 : VOP3Inst<"v_cvt_scalef32_pk32_f32_bf6", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V32F32_V6I32_F32>, int_amdgcn_cvt_scalef32_pk32_f32_bf6>; defm V_CVT_SCALEF32_PK32_F16_FP6 : VOP3Inst<"v_cvt_scalef32_pk32_f16_fp6", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V32F16_V6I32_F32>, int_amdgcn_cvt_scalef32_pk32_f16_fp6>; @@ -1112,7 +1114,7 @@ let SubtargetPredicate = HasFP6BF6ConversionScaleInsts, mayRaiseFPException = 0 defm V_CVT_SCALEF32_PK32_BF16_BF6 : VOP3Inst<"v_cvt_scalef32_pk32_bf16_bf6", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V32BF16_V6I32_F32>, int_amdgcn_cvt_scalef32_pk32_bf16_bf6>; } -let SubtargetPredicate = HasF16BF16ToFP6BF6ConversionScaleInsts, mayRaiseFPException = 0 
in { +let SubtargetPredicate = HasF16BF16ToFP6BF6ConversionScaleInsts, mayRaiseFPException = 0, Constraints = "@earlyclobber $vdst" in { defm V_CVT_SCALEF32_PK32_FP6_F16 : VOP3Inst<"v_cvt_scalef32_pk32_fp6_f16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V6I32_V32F16_F32>, int_amdgcn_cvt_scalef32_pk32_fp6_f16>; defm V_CVT_SCALEF32_PK32_BF6_F16 : VOP3Inst<"v_cvt_scalef32_pk32_bf6_f16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V6I32_V32F16_F32>, int_amdgcn_cvt_scalef32_pk32_bf6_f16>; defm V_CVT_SCALEF32_PK32_FP6_BF16 : VOP3Inst<"v_cvt_scalef32_pk32_fp6_bf16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V6I32_V32BF16_F32>, int_amdgcn_cvt_scalef32_pk32_fp6_bf16>; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.gfx950.ll index 6d627186d25816..f80f2935856e36 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.gfx950.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.gfx950.ll @@ -864,31 +864,91 @@ define <2 x bfloat> @test_cvt_scale_bf16_fp4_byte3(i32 %src, float %scale) { } define <32 x float> @test_cvt_scale_pk32_f32_fp6(<6 x i32> %src, float %scale) { -; GCN-LABEL: test_cvt_scale_pk32_f32_fp6: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cvt_scalef32_pk32_f32_fp6 v[0:31], v[0:5], v6 -; GCN-NEXT: s_setpc_b64 s[30:31] +; GFX950-SDAG-LABEL: test_cvt_scale_pk32_f32_fp6: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: v_mov_b32_e32 v38, v6 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v37, v5 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v36, v4 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v35, v3 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v34, v2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v33, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v32, v0 +; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_f32_fp6 v[0:31], v[32:37], v38 +; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-GISEL-LABEL: test_cvt_scale_pk32_f32_fp6: +; GFX950-GISEL: ; %bb.0: 
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: v_mov_b32_e32 v32, v0 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v33, v1 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v34, v2 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v35, v3 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v36, v4 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v37, v5 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v38, v6 +; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_f32_fp6 v[0:31], v[32:37], v38 +; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31] %ret = tail call <32 x float> @llvm.amdgcn.cvt.scalef32.pk32.f32.fp6(<6 x i32> %src, float %scale) ret <32 x float> %ret } define <32 x float> @test_cvt_scale_pk32_f32_bf6(<6 x i32> %src, float %scale) { -; GCN-LABEL: test_cvt_scale_pk32_f32_bf6: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cvt_scalef32_pk32_f32_bf6 v[0:31], v[0:5], v6 -; GCN-NEXT: s_setpc_b64 s[30:31] +; GFX950-SDAG-LABEL: test_cvt_scale_pk32_f32_bf6: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: v_mov_b32_e32 v38, v6 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v37, v5 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v36, v4 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v35, v3 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v34, v2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v33, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v32, v0 +; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_f32_bf6 v[0:31], v[32:37], v38 +; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-GISEL-LABEL: test_cvt_scale_pk32_f32_bf6: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: v_mov_b32_e32 v32, v0 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v33, v1 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v34, v2 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v35, v3 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v36, v4 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v37, v5 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v38, v6 +; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_f32_bf6 v[0:31], v[32:37], v38 +; GFX950-GISEL-NEXT: 
s_setpc_b64 s[30:31] %ret = tail call <32 x float> @llvm.amdgcn.cvt.scalef32.pk32.f32.bf6(<6 x i32> %src, float %scale) ret <32 x float> %ret } define <32 x half> @test_cvt_scalef32_pk32_f16_fp6_vv(<6 x i32> %src, float %scale) { -; GCN-LABEL: test_cvt_scalef32_pk32_f16_fp6_vv: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cvt_scalef32_pk32_f16_fp6 v[0:15], v[0:5], v6 -; GCN-NEXT: s_setpc_b64 s[30:31] +; GFX950-SDAG-LABEL: test_cvt_scalef32_pk32_f16_fp6_vv: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: v_mov_b32_e32 v22, v6 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v21, v5 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v20, v4 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v19, v3 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, v2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v17, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, v0 +; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_f16_fp6 v[0:15], v[16:21], v22 +; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-GISEL-LABEL: test_cvt_scalef32_pk32_f16_fp6_vv: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, v0 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v17, v1 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v18, v2 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v19, v3 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v20, v4 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v21, v5 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v22, v6 +; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_f16_fp6 v[0:15], v[16:21], v22 +; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31] %ret = tail call <32 x half> @llvm.amdgcn.cvt.scalef32.pk32.f16.fp6(<6 x i32> %src, float %scale) ret <32 x half> %ret } @@ -897,14 +957,14 @@ define <32 x half> @test_cvt_scalef32_pk32_f16_fp6_sl(<6 x i32> inreg %src) { ; GFX950-SDAG-LABEL: test_cvt_scalef32_pk32_f16_fp6_sl: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; GFX950-SDAG-NEXT: 
v_mov_b32_e32 v1, s1 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, s2 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, s3 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, s16 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, s17 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, s0 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v17, s1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, s2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v19, s3 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v20, s16 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v21, s17 ; GFX950-SDAG-NEXT: s_mov_b32 s0, 0x42c80000 -; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_f16_fp6 v[0:15], v[0:5], s0 +; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_f16_fp6 v[0:15], v[16:21], s0 ; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-GISEL-LABEL: test_cvt_scalef32_pk32_f16_fp6_sl: @@ -912,11 +972,11 @@ define <32 x half> @test_cvt_scalef32_pk32_f16_fp6_sl(<6 x i32> inreg %src) { ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: s_mov_b32 s4, s16 ; GFX950-GISEL-NEXT: s_mov_b32 s5, s17 -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5] -; GFX950-GISEL-NEXT: v_mov_b32_e32 v6, 0x42c80000 -; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_f16_fp6 v[0:15], v[0:5], v6 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[4:5] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[2:3] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[0:1] +; GFX950-GISEL-NEXT: v_mov_b32_e32 v22, 0x42c80000 +; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_f16_fp6 v[0:15], v[16:21], v22 ; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31] %ret = tail call <32 x half> @llvm.amdgcn.cvt.scalef32.pk32.f16.fp6(<6 x i32> %src, float 100.0) ret <32 x half> %ret @@ -926,7 +986,14 @@ define <32 x bfloat> @test_cvt_scalef32_pk32_bf16_fp6_vv(<6 x i32> %src, float % ; GFX950-SDAG-LABEL: test_cvt_scalef32_pk32_bf16_fp6_vv: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_bf16_fp6 v[0:15], 
v[0:5], v6 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v22, v6 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v21, v5 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v20, v4 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v19, v3 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, v2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v17, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, v0 +; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_bf16_fp6 v[0:15], v[16:21], v22 ; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-GISEL-LABEL: test_cvt_scalef32_pk32_bf16_fp6_vv: @@ -958,14 +1025,14 @@ define <32 x bfloat> @test_cvt_scalef32_pk32_bf16_fp6_sl(<6 x i32> inreg %src) { ; GFX950-SDAG-LABEL: test_cvt_scalef32_pk32_bf16_fp6_sl: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, s1 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, s2 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, s3 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, s16 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, s17 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, s0 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v17, s1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, s2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v19, s3 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v20, s16 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v21, s17 ; GFX950-SDAG-NEXT: s_mov_b32 s0, 0x42c80000 -; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_bf16_fp6 v[0:15], v[0:5], s0 +; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_bf16_fp6 v[0:15], v[16:21], s0 ; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-GISEL-LABEL: test_cvt_scalef32_pk32_bf16_fp6_sl: @@ -1000,11 +1067,31 @@ define <32 x bfloat> @test_cvt_scalef32_pk32_bf16_fp6_sl(<6 x i32> inreg %src) { } define <32 x half> @test_cvt_scalef32_pk32_f16_bf6_vv(<6 x i32> %src, float %scale) { -; GCN-LABEL: test_cvt_scalef32_pk32_f16_bf6_vv: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cvt_scalef32_pk32_f16_bf6 v[0:15], v[0:5], v6 -; GCN-NEXT: s_setpc_b64 s[30:31] +; GFX950-SDAG-LABEL: test_cvt_scalef32_pk32_f16_bf6_vv: +; 
GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: v_mov_b32_e32 v22, v6 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v21, v5 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v20, v4 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v19, v3 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, v2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v17, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, v0 +; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_f16_bf6 v[0:15], v[16:21], v22 +; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-GISEL-LABEL: test_cvt_scalef32_pk32_f16_bf6_vv: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, v0 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v17, v1 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v18, v2 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v19, v3 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v20, v4 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v21, v5 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v22, v6 +; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_f16_bf6 v[0:15], v[16:21], v22 +; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31] %ret = tail call <32 x half> @llvm.amdgcn.cvt.scalef32.pk32.f16.bf6(<6 x i32> %src, float %scale) ret <32 x half> %ret } @@ -1013,14 +1100,14 @@ define <32 x half> @test_cvt_scalef32_pk32_f16_bf6_sl(<6 x i32> inreg %src) { ; GFX950-SDAG-LABEL: test_cvt_scalef32_pk32_f16_bf6_sl: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, s1 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, s2 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, s3 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, s16 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, s17 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, s0 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v17, s1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, s2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v19, s3 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v20, s16 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v21, s17 ; GFX950-SDAG-NEXT: s_mov_b32 s0, 0x42c80000 -; 
GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_f16_bf6 v[0:15], v[0:5], s0 +; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_f16_bf6 v[0:15], v[16:21], s0 ; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-GISEL-LABEL: test_cvt_scalef32_pk32_f16_bf6_sl: @@ -1028,11 +1115,11 @@ define <32 x half> @test_cvt_scalef32_pk32_f16_bf6_sl(<6 x i32> inreg %src) { ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: s_mov_b32 s4, s16 ; GFX950-GISEL-NEXT: s_mov_b32 s5, s17 -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5] -; GFX950-GISEL-NEXT: v_mov_b32_e32 v6, 0x42c80000 -; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_f16_bf6 v[0:15], v[0:5], v6 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[4:5] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[2:3] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[0:1] +; GFX950-GISEL-NEXT: v_mov_b32_e32 v22, 0x42c80000 +; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_f16_bf6 v[0:15], v[16:21], v22 ; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31] %ret = tail call <32 x half> @llvm.amdgcn.cvt.scalef32.pk32.f16.bf6(<6 x i32> %src, float 100.0) ret <32 x half> %ret @@ -1042,7 +1129,14 @@ define <32 x bfloat> @test_cvt_scalef32_pk32_bf16_bf6_vv(<6 x i32> %src, float % ; GFX950-SDAG-LABEL: test_cvt_scalef32_pk32_bf16_bf6_vv: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_bf16_bf6 v[0:15], v[0:5], v6 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v22, v6 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v21, v5 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v20, v4 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v19, v3 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, v2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v17, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, v0 +; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_bf16_bf6 v[0:15], v[16:21], v22 ; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-GISEL-LABEL: test_cvt_scalef32_pk32_bf16_bf6_vv: @@ -1074,14 
+1168,14 @@ define <32 x bfloat> @test_cvt_scalef32_pk32_bf16_bf6_sl(<6 x i32> inreg %src) { ; GFX950-SDAG-LABEL: test_cvt_scalef32_pk32_bf16_bf6_sl: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, s1 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, s2 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, s3 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, s16 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, s17 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, s0 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v17, s1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, s2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v19, s3 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v20, s16 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v21, s17 ; GFX950-SDAG-NEXT: s_mov_b32 s0, 0x42c80000 -; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_bf16_bf6 v[0:15], v[0:5], s0 +; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_bf16_bf6 v[0:15], v[16:21], s0 ; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-GISEL-LABEL: test_cvt_scalef32_pk32_bf16_bf6_sl: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.ll index 4153bc8f43563b..f9fd7e253b1243 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.ll @@ -10,24 +10,24 @@ declare <6 x i32> @llvm.amdgcn.cvt.scalef32.pk32.fp6.f16(<32 x half> %src, float define amdgpu_ps void @test_scalef32_pk32_bf6_bf16_vv(<32 x bfloat> %src, float %scale, ptr addrspace(1) %out) { ; GFX950-SDAG-LABEL: test_scalef32_pk32_bf6_bf16_vv: ; GFX950-SDAG: ; %bb.0: -; GFX950-SDAG-NEXT: v_mov_b32_e32 v19, v18 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, v17 -; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_bf6_bf16 v[0:5], v[0:15], v16 -; GFX950-SDAG-NEXT: global_store_dwordx2 v[18:19], v[4:5], off offset:16 -; GFX950-SDAG-NEXT: global_store_dwordx4 v[18:19], v[0:3], off +; GFX950-SDAG-NEXT: v_mov_b32_e32 v25, v18 +; GFX950-SDAG-NEXT: v_mov_b32_e32 
v24, v17 +; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_bf6_bf16 v[18:23], v[0:15], v16 +; GFX950-SDAG-NEXT: global_store_dwordx2 v[24:25], v[22:23], off offset:16 +; GFX950-SDAG-NEXT: global_store_dwordx4 v[24:25], v[18:21], off ; GFX950-SDAG-NEXT: s_endpgm ; ; GFX950-GISEL-LABEL: test_scalef32_pk32_bf6_bf16_vv: ; GFX950-GISEL: ; %bb.0: -; GFX950-GISEL-NEXT: v_mov_b32_e32 v20, v17 -; GFX950-GISEL-NEXT: v_mov_b32_e32 v21, v18 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v24, v17 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v25, v18 ; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v17, 16, v0 ; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v18, 16, v1 ; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v19, 16, v2 -; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v22, 16, v3 -; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v23, 16, v4 -; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v24, 16, v5 -; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v25, 16, v6 +; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v20, 16, v3 +; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v21, 16, v4 +; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v22, 16, v5 +; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v23, 16, v6 ; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v26, 16, v7 ; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v27, 16, v8 ; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v28, 16, v9 @@ -40,10 +40,10 @@ define amdgpu_ps void @test_scalef32_pk32_bf6_bf16_vv(<32 x bfloat> %src, float ; GFX950-GISEL-NEXT: v_mov_b32_sdwa v0, v17 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 ; GFX950-GISEL-NEXT: v_mov_b32_sdwa v1, v18 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 ; GFX950-GISEL-NEXT: v_mov_b32_sdwa v2, v19 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GFX950-GISEL-NEXT: v_mov_b32_sdwa v3, v22 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GFX950-GISEL-NEXT: v_mov_b32_sdwa v4, v23 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GFX950-GISEL-NEXT: v_mov_b32_sdwa v5, v24 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GFX950-GISEL-NEXT: v_mov_b32_sdwa v6, v25 
dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; GFX950-GISEL-NEXT: v_mov_b32_sdwa v3, v20 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; GFX950-GISEL-NEXT: v_mov_b32_sdwa v4, v21 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; GFX950-GISEL-NEXT: v_mov_b32_sdwa v5, v22 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; GFX950-GISEL-NEXT: v_mov_b32_sdwa v6, v23 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 ; GFX950-GISEL-NEXT: v_mov_b32_sdwa v7, v26 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 ; GFX950-GISEL-NEXT: v_mov_b32_sdwa v8, v27 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 ; GFX950-GISEL-NEXT: v_mov_b32_sdwa v9, v28 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 @@ -54,9 +54,9 @@ define amdgpu_ps void @test_scalef32_pk32_bf6_bf16_vv(<32 x bfloat> %src, float ; GFX950-GISEL-NEXT: v_mov_b32_sdwa v14, v33 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 ; GFX950-GISEL-NEXT: v_mov_b32_sdwa v15, v34 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 ; GFX950-GISEL-NEXT: s_nop 0 -; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_bf6_bf16 v[0:5], v[0:15], v16 -; GFX950-GISEL-NEXT: global_store_dwordx4 v[20:21], v[0:3], off -; GFX950-GISEL-NEXT: global_store_dwordx2 v[20:21], v[4:5], off offset:16 +; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_bf6_bf16 v[18:23], v[0:15], v16 +; GFX950-GISEL-NEXT: global_store_dwordx4 v[24:25], v[18:21], off +; GFX950-GISEL-NEXT: global_store_dwordx2 v[24:25], v[22:23], off offset:16 ; GFX950-GISEL-NEXT: s_endpgm %cvt = tail call <6 x i32> @llvm.amdgcn.cvt.scalef32.pk32.bf6.bf16(<32 x bfloat> %src, float %scale) store <6 x i32> %cvt, ptr addrspace(1) %out, align 8 @@ -83,9 +83,9 @@ define amdgpu_ps void @test_scalef32_pk32_bf6_bf16_sl(<32 x bfloat> inreg %src, ; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, s14 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v17, s15 ; GFX950-SDAG-NEXT: s_mov_b32 s0, 0x42c80000 -; GFX950-SDAG-NEXT: 
v_cvt_scalef32_pk32_bf6_bf16 v[2:7], v[2:17], s0 -; GFX950-SDAG-NEXT: global_store_dwordx2 v[0:1], v[6:7], off offset:16 -; GFX950-SDAG-NEXT: global_store_dwordx4 v[0:1], v[2:5], off +; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_bf6_bf16 v[18:23], v[2:17], s0 +; GFX950-SDAG-NEXT: global_store_dwordx2 v[0:1], v[22:23], off offset:16 +; GFX950-SDAG-NEXT: global_store_dwordx4 v[0:1], v[18:21], off ; GFX950-SDAG-NEXT: s_endpgm ; ; GFX950-GISEL-LABEL: test_scalef32_pk32_bf6_bf16_sl: @@ -162,10 +162,10 @@ define amdgpu_ps void @test_scalef32_pk32_bf6_bf16_sl(<32 x bfloat> inreg %src, ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[4:5] ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[2:3] ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] -; GFX950-GISEL-NEXT: v_mov_b32_e32 v18, 0x42c80000 -; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_bf6_bf16 v[2:7], v[2:17], v18 -; GFX950-GISEL-NEXT: global_store_dwordx4 v[0:1], v[2:5], off -; GFX950-GISEL-NEXT: global_store_dwordx2 v[0:1], v[6:7], off offset:16 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v24, 0x42c80000 +; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_bf6_bf16 v[18:23], v[2:17], v24 +; GFX950-GISEL-NEXT: global_store_dwordx4 v[0:1], v[18:21], off +; GFX950-GISEL-NEXT: global_store_dwordx2 v[0:1], v[22:23], off offset:16 ; GFX950-GISEL-NEXT: s_endpgm %cvt = tail call <6 x i32> @llvm.amdgcn.cvt.scalef32.pk32.bf6.bf16(<32 x bfloat> %src, float 100.0) store <6 x i32> %cvt, ptr addrspace(1) %out, align 8 @@ -175,20 +175,20 @@ define amdgpu_ps void @test_scalef32_pk32_bf6_bf16_sl(<32 x bfloat> inreg %src, define amdgpu_ps void @test_scalef32_pk32_bf6_f16_vv(<32 x half> %src, float %scale, ptr addrspace(1) %out) { ; GFX950-SDAG-LABEL: test_scalef32_pk32_bf6_f16_vv: ; GFX950-SDAG: ; %bb.0: -; GFX950-SDAG-NEXT: v_mov_b32_e32 v19, v18 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, v17 -; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_bf6_f16 v[0:5], v[0:15], v16 -; GFX950-SDAG-NEXT: global_store_dwordx2 v[18:19], v[4:5], off offset:16 -; GFX950-SDAG-NEXT: 
global_store_dwordx4 v[18:19], v[0:3], off +; GFX950-SDAG-NEXT: v_mov_b32_e32 v25, v18 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v24, v17 +; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_bf6_f16 v[18:23], v[0:15], v16 +; GFX950-SDAG-NEXT: global_store_dwordx2 v[24:25], v[22:23], off offset:16 +; GFX950-SDAG-NEXT: global_store_dwordx4 v[24:25], v[18:21], off ; GFX950-SDAG-NEXT: s_endpgm ; ; GFX950-GISEL-LABEL: test_scalef32_pk32_bf6_f16_vv: ; GFX950-GISEL: ; %bb.0: -; GFX950-GISEL-NEXT: v_mov_b32_e32 v20, v17 -; GFX950-GISEL-NEXT: v_mov_b32_e32 v21, v18 -; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_bf6_f16 v[0:5], v[0:15], v16 -; GFX950-GISEL-NEXT: global_store_dwordx4 v[20:21], v[0:3], off -; GFX950-GISEL-NEXT: global_store_dwordx2 v[20:21], v[4:5], off offset:16 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v24, v17 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v25, v18 +; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_bf6_f16 v[18:23], v[0:15], v16 +; GFX950-GISEL-NEXT: global_store_dwordx4 v[24:25], v[18:21], off +; GFX950-GISEL-NEXT: global_store_dwordx2 v[24:25], v[22:23], off offset:16 ; GFX950-GISEL-NEXT: s_endpgm %cvt = tail call <6 x i32> @llvm.amdgcn.cvt.scalef32.pk32.bf6.f16(<32 x half> %src, float %scale) store <6 x i32> %cvt, ptr addrspace(1) %out, align 8 @@ -215,9 +215,9 @@ define amdgpu_ps void @test_scalef32_pk32_bf6_f16_sl(<32 x half> inreg %src, ptr ; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, s14 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v17, s15 ; GFX950-SDAG-NEXT: s_mov_b32 s0, 0x42c80000 -; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_bf6_f16 v[2:7], v[2:17], s0 -; GFX950-SDAG-NEXT: global_store_dwordx2 v[0:1], v[6:7], off offset:16 -; GFX950-SDAG-NEXT: global_store_dwordx4 v[0:1], v[2:5], off +; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_bf6_f16 v[18:23], v[2:17], s0 +; GFX950-SDAG-NEXT: global_store_dwordx2 v[0:1], v[22:23], off offset:16 +; GFX950-SDAG-NEXT: global_store_dwordx4 v[0:1], v[18:21], off ; GFX950-SDAG-NEXT: s_endpgm ; ; GFX950-GISEL-LABEL: test_scalef32_pk32_bf6_f16_sl: @@ -230,10 +230,10 @@ define 
amdgpu_ps void @test_scalef32_pk32_bf6_f16_sl(<32 x half> inreg %src, ptr ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[4:5] ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[2:3] ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] -; GFX950-GISEL-NEXT: v_mov_b32_e32 v18, 0x42c80000 -; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_bf6_f16 v[2:7], v[2:17], v18 -; GFX950-GISEL-NEXT: global_store_dwordx4 v[0:1], v[2:5], off -; GFX950-GISEL-NEXT: global_store_dwordx2 v[0:1], v[6:7], off offset:16 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v24, 0x42c80000 +; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_bf6_f16 v[18:23], v[2:17], v24 +; GFX950-GISEL-NEXT: global_store_dwordx4 v[0:1], v[18:21], off +; GFX950-GISEL-NEXT: global_store_dwordx2 v[0:1], v[22:23], off offset:16 ; GFX950-GISEL-NEXT: s_endpgm %cvt = tail call <6 x i32> @llvm.amdgcn.cvt.scalef32.pk32.bf6.f16(<32 x half> %src, float 100.0) store <6 x i32> %cvt, ptr addrspace(1) %out, align 8 @@ -243,24 +243,24 @@ define amdgpu_ps void @test_scalef32_pk32_bf6_f16_sl(<32 x half> inreg %src, ptr define amdgpu_ps void @test_scalef32_pk32_fp6_bf16_vv(<32 x bfloat> %src, float %scale, ptr addrspace(1) %out) { ; GFX950-SDAG-LABEL: test_scalef32_pk32_fp6_bf16_vv: ; GFX950-SDAG: ; %bb.0: -; GFX950-SDAG-NEXT: v_mov_b32_e32 v19, v18 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, v17 -; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_fp6_bf16 v[0:5], v[0:15], v16 -; GFX950-SDAG-NEXT: global_store_dwordx2 v[18:19], v[4:5], off offset:16 -; GFX950-SDAG-NEXT: global_store_dwordx4 v[18:19], v[0:3], off +; GFX950-SDAG-NEXT: v_mov_b32_e32 v25, v18 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v24, v17 +; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_fp6_bf16 v[18:23], v[0:15], v16 +; GFX950-SDAG-NEXT: global_store_dwordx2 v[24:25], v[22:23], off offset:16 +; GFX950-SDAG-NEXT: global_store_dwordx4 v[24:25], v[18:21], off ; GFX950-SDAG-NEXT: s_endpgm ; ; GFX950-GISEL-LABEL: test_scalef32_pk32_fp6_bf16_vv: ; GFX950-GISEL: ; %bb.0: -; GFX950-GISEL-NEXT: v_mov_b32_e32 v20, v17 -; GFX950-GISEL-NEXT: 
v_mov_b32_e32 v21, v18 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v24, v17 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v25, v18 ; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v17, 16, v0 ; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v18, 16, v1 ; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v19, 16, v2 -; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v22, 16, v3 -; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v23, 16, v4 -; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v24, 16, v5 -; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v25, 16, v6 +; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v20, 16, v3 +; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v21, 16, v4 +; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v22, 16, v5 +; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v23, 16, v6 ; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v26, 16, v7 ; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v27, 16, v8 ; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v28, 16, v9 @@ -273,10 +273,10 @@ define amdgpu_ps void @test_scalef32_pk32_fp6_bf16_vv(<32 x bfloat> %src, float ; GFX950-GISEL-NEXT: v_mov_b32_sdwa v0, v17 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 ; GFX950-GISEL-NEXT: v_mov_b32_sdwa v1, v18 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 ; GFX950-GISEL-NEXT: v_mov_b32_sdwa v2, v19 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GFX950-GISEL-NEXT: v_mov_b32_sdwa v3, v22 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GFX950-GISEL-NEXT: v_mov_b32_sdwa v4, v23 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GFX950-GISEL-NEXT: v_mov_b32_sdwa v5, v24 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GFX950-GISEL-NEXT: v_mov_b32_sdwa v6, v25 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; GFX950-GISEL-NEXT: v_mov_b32_sdwa v3, v20 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; GFX950-GISEL-NEXT: v_mov_b32_sdwa v4, v21 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; GFX950-GISEL-NEXT: v_mov_b32_sdwa v5, v22 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; GFX950-GISEL-NEXT: 
v_mov_b32_sdwa v6, v23 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 ; GFX950-GISEL-NEXT: v_mov_b32_sdwa v7, v26 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 ; GFX950-GISEL-NEXT: v_mov_b32_sdwa v8, v27 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 ; GFX950-GISEL-NEXT: v_mov_b32_sdwa v9, v28 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 @@ -287,9 +287,9 @@ define amdgpu_ps void @test_scalef32_pk32_fp6_bf16_vv(<32 x bfloat> %src, float ; GFX950-GISEL-NEXT: v_mov_b32_sdwa v14, v33 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 ; GFX950-GISEL-NEXT: v_mov_b32_sdwa v15, v34 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 ; GFX950-GISEL-NEXT: s_nop 0 -; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_fp6_bf16 v[0:5], v[0:15], v16 -; GFX950-GISEL-NEXT: global_store_dwordx4 v[20:21], v[0:3], off -; GFX950-GISEL-NEXT: global_store_dwordx2 v[20:21], v[4:5], off offset:16 +; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_fp6_bf16 v[18:23], v[0:15], v16 +; GFX950-GISEL-NEXT: global_store_dwordx4 v[24:25], v[18:21], off +; GFX950-GISEL-NEXT: global_store_dwordx2 v[24:25], v[22:23], off offset:16 ; GFX950-GISEL-NEXT: s_endpgm %cvt = tail call <6 x i32> @llvm.amdgcn.cvt.scalef32.pk32.fp6.bf16(<32 x bfloat> %src, float %scale) store <6 x i32> %cvt, ptr addrspace(1) %out, align 8 @@ -316,9 +316,9 @@ define amdgpu_ps void @test_scalef32_pk32_fp6_bf16_sl(<32 x bfloat> inreg %src, ; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, s14 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v17, s15 ; GFX950-SDAG-NEXT: s_mov_b32 s0, 0x42c80000 -; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_fp6_bf16 v[2:7], v[2:17], s0 -; GFX950-SDAG-NEXT: global_store_dwordx2 v[0:1], v[6:7], off offset:16 -; GFX950-SDAG-NEXT: global_store_dwordx4 v[0:1], v[2:5], off +; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_fp6_bf16 v[18:23], v[2:17], s0 +; GFX950-SDAG-NEXT: global_store_dwordx2 v[0:1], v[22:23], off offset:16 +; GFX950-SDAG-NEXT: global_store_dwordx4 v[0:1], v[18:21], off ; 
GFX950-SDAG-NEXT: s_endpgm ; ; GFX950-GISEL-LABEL: test_scalef32_pk32_fp6_bf16_sl: @@ -395,10 +395,10 @@ define amdgpu_ps void @test_scalef32_pk32_fp6_bf16_sl(<32 x bfloat> inreg %src, ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[4:5] ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[2:3] ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] -; GFX950-GISEL-NEXT: v_mov_b32_e32 v18, 0x42c80000 -; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_fp6_bf16 v[2:7], v[2:17], v18 -; GFX950-GISEL-NEXT: global_store_dwordx4 v[0:1], v[2:5], off -; GFX950-GISEL-NEXT: global_store_dwordx2 v[0:1], v[6:7], off offset:16 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v24, 0x42c80000 +; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_fp6_bf16 v[18:23], v[2:17], v24 +; GFX950-GISEL-NEXT: global_store_dwordx4 v[0:1], v[18:21], off +; GFX950-GISEL-NEXT: global_store_dwordx2 v[0:1], v[22:23], off offset:16 ; GFX950-GISEL-NEXT: s_endpgm %cvt = tail call <6 x i32> @llvm.amdgcn.cvt.scalef32.pk32.fp6.bf16(<32 x bfloat> %src, float 100.0) store <6 x i32> %cvt, ptr addrspace(1) %out, align 8 @@ -408,20 +408,20 @@ define amdgpu_ps void @test_scalef32_pk32_fp6_bf16_sl(<32 x bfloat> inreg %src, define amdgpu_ps void @test_scalef32_pk32_fp6_f16_vv(<32 x half> %src, float %scale, ptr addrspace(1) %out) { ; GFX950-SDAG-LABEL: test_scalef32_pk32_fp6_f16_vv: ; GFX950-SDAG: ; %bb.0: -; GFX950-SDAG-NEXT: v_mov_b32_e32 v19, v18 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, v17 -; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_fp6_f16 v[0:5], v[0:15], v16 -; GFX950-SDAG-NEXT: global_store_dwordx2 v[18:19], v[4:5], off offset:16 -; GFX950-SDAG-NEXT: global_store_dwordx4 v[18:19], v[0:3], off +; GFX950-SDAG-NEXT: v_mov_b32_e32 v25, v18 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v24, v17 +; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_fp6_f16 v[18:23], v[0:15], v16 +; GFX950-SDAG-NEXT: global_store_dwordx2 v[24:25], v[22:23], off offset:16 +; GFX950-SDAG-NEXT: global_store_dwordx4 v[24:25], v[18:21], off ; GFX950-SDAG-NEXT: s_endpgm ; ; GFX950-GISEL-LABEL: 
test_scalef32_pk32_fp6_f16_vv: ; GFX950-GISEL: ; %bb.0: -; GFX950-GISEL-NEXT: v_mov_b32_e32 v20, v17 -; GFX950-GISEL-NEXT: v_mov_b32_e32 v21, v18 -; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_fp6_f16 v[0:5], v[0:15], v16 -; GFX950-GISEL-NEXT: global_store_dwordx4 v[20:21], v[0:3], off -; GFX950-GISEL-NEXT: global_store_dwordx2 v[20:21], v[4:5], off offset:16 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v24, v17 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v25, v18 +; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_fp6_f16 v[18:23], v[0:15], v16 +; GFX950-GISEL-NEXT: global_store_dwordx4 v[24:25], v[18:21], off +; GFX950-GISEL-NEXT: global_store_dwordx2 v[24:25], v[22:23], off offset:16 ; GFX950-GISEL-NEXT: s_endpgm %cvt = tail call <6 x i32> @llvm.amdgcn.cvt.scalef32.pk32.fp6.f16(<32 x half> %src, float %scale) store <6 x i32> %cvt, ptr addrspace(1) %out, align 8 @@ -448,9 +448,9 @@ define amdgpu_ps void @test_scalef32_pk32_fp6_f16_sl(<32 x half> inreg %src, ptr ; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, s14 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v17, s15 ; GFX950-SDAG-NEXT: s_mov_b32 s0, 0x42c80000 -; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_fp6_f16 v[2:7], v[2:17], s0 -; GFX950-SDAG-NEXT: global_store_dwordx2 v[0:1], v[6:7], off offset:16 -; GFX950-SDAG-NEXT: global_store_dwordx4 v[0:1], v[2:5], off +; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_fp6_f16 v[18:23], v[2:17], s0 +; GFX950-SDAG-NEXT: global_store_dwordx2 v[0:1], v[22:23], off offset:16 +; GFX950-SDAG-NEXT: global_store_dwordx4 v[0:1], v[18:21], off ; GFX950-SDAG-NEXT: s_endpgm ; ; GFX950-GISEL-LABEL: test_scalef32_pk32_fp6_f16_sl: @@ -463,10 +463,10 @@ define amdgpu_ps void @test_scalef32_pk32_fp6_f16_sl(<32 x half> inreg %src, ptr ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[4:5] ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[2:3] ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] -; GFX950-GISEL-NEXT: v_mov_b32_e32 v18, 0x42c80000 -; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_fp6_f16 v[2:7], v[2:17], v18 -; GFX950-GISEL-NEXT: global_store_dwordx4 v[0:1], 
v[2:5], off -; GFX950-GISEL-NEXT: global_store_dwordx2 v[0:1], v[6:7], off offset:16 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v24, 0x42c80000 +; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_fp6_f16 v[18:23], v[2:17], v24 +; GFX950-GISEL-NEXT: global_store_dwordx4 v[0:1], v[18:21], off +; GFX950-GISEL-NEXT: global_store_dwordx2 v[0:1], v[22:23], off offset:16 ; GFX950-GISEL-NEXT: s_endpgm %cvt = tail call <6 x i32> @llvm.amdgcn.cvt.scalef32.pk32.fp6.f16(<32 x half> %src, float 100.0) store <6 x i32> %cvt, ptr addrspace(1) %out, align 8 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk.gfx950.ll index b64ca3fb67f605..1107b46f8f6d38 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk.gfx950.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk.gfx950.ll @@ -11,8 +11,8 @@ define amdgpu_ps void @test_scalef32_sr_pk_fp4_f16_dst_sel_0(ptr addrspace(1) %o ; GFX950: ; %bb.0: ; GFX950-NEXT: global_load_dword v5, v[0:1], off ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cvt_scalef32_sr_pk_fp4_f16 v5, v2, v3, v4 -; GFX950-NEXT: global_store_dword v[0:1], v5, off +; GFX950-NEXT: v_cvt_scalef32_sr_pk_fp4_f16 v6, v2, v3, v4 +; GFX950-NEXT: global_store_dword v[0:1], v6, off ; GFX950-NEXT: s_endpgm %old = load i32, ptr addrspace(1) %out, align 4 %cvt = tail call i32 @llvm.amdgcn.cvt.scalef32.sr.pk.fp4.f16(i32 %old, <2 x half> %src, i32 %seed, float %scale, i32 0) @@ -25,8 +25,8 @@ define amdgpu_ps void @test_scalef32_sr_pk_fp4_f16_dst_sel_1(ptr addrspace(1) %o ; GFX950: ; %bb.0: ; GFX950-NEXT: global_load_dword v5, v[0:1], off ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cvt_scalef32_sr_pk_fp4_f16 v5, v2, v3, v4 op_sel:[0,0,1,0] -; GFX950-NEXT: global_store_dword v[0:1], v5, off +; GFX950-NEXT: v_cvt_scalef32_sr_pk_fp4_f16 v6, v2, v3, v4 op_sel:[0,0,1,0] +; GFX950-NEXT: global_store_dword v[0:1], v6, off ; GFX950-NEXT: s_endpgm %old = load i32, ptr addrspace(1) 
%out, align 4 %cvt = tail call i32 @llvm.amdgcn.cvt.scalef32.sr.pk.fp4.f16(i32 %old, <2 x half> %src, i32 %seed, float %scale, i32 1) @@ -39,8 +39,8 @@ define amdgpu_ps void @test_scalef32_sr_pk_fp4_f16_dst_sel_2(ptr addrspace(1) %o ; GFX950: ; %bb.0: ; GFX950-NEXT: global_load_dword v5, v[0:1], off ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cvt_scalef32_sr_pk_fp4_f16 v5, v2, v3, v4 op_sel:[0,0,0,1] -; GFX950-NEXT: global_store_dword v[0:1], v5, off +; GFX950-NEXT: v_cvt_scalef32_sr_pk_fp4_f16 v6, v2, v3, v4 op_sel:[0,0,0,1] +; GFX950-NEXT: global_store_dword v[0:1], v6, off ; GFX950-NEXT: s_endpgm %old = load i32, ptr addrspace(1) %out, align 4 %cvt = tail call i32 @llvm.amdgcn.cvt.scalef32.sr.pk.fp4.f16(i32 %old, <2 x half> %src, i32 %seed, float %scale, i32 2) @@ -53,8 +53,8 @@ define amdgpu_ps void @test_scalef32_sr_pk_fp4_f16_dst_sel_3(ptr addrspace(1) %o ; GFX950: ; %bb.0: ; GFX950-NEXT: global_load_dword v5, v[0:1], off ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cvt_scalef32_sr_pk_fp4_f16 v5, v2, v3, v4 op_sel:[0,0,1,1] -; GFX950-NEXT: global_store_dword v[0:1], v5, off +; GFX950-NEXT: v_cvt_scalef32_sr_pk_fp4_f16 v6, v2, v3, v4 op_sel:[0,0,1,1] +; GFX950-NEXT: global_store_dword v[0:1], v6, off ; GFX950-NEXT: s_endpgm %old = load i32, ptr addrspace(1) %out, align 4 %cvt = tail call i32 @llvm.amdgcn.cvt.scalef32.sr.pk.fp4.f16(i32 %old, <2 x half> %src, i32 %seed, float %scale, i32 3) @@ -67,8 +67,8 @@ define amdgpu_ps void @test_scalef32_sr_pk_fp4_bf16_dst_sel_0(ptr addrspace(1) % ; GFX950: ; %bb.0: ; GFX950-NEXT: global_load_dword v5, v[0:1], off ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cvt_scalef32_sr_pk_fp4_bf16 v5, v2, v3, v4 -; GFX950-NEXT: global_store_dword v[0:1], v5, off +; GFX950-NEXT: v_cvt_scalef32_sr_pk_fp4_bf16 v6, v2, v3, v4 +; GFX950-NEXT: global_store_dword v[0:1], v6, off ; GFX950-NEXT: s_endpgm %old = load i32, ptr addrspace(1) %out, align 4 %cvt = tail call i32 @llvm.amdgcn.cvt.scalef32.sr.pk.fp4.bf16(i32 
%old, <2 x bfloat> %src, i32 %seed, float %scale, i32 0) @@ -81,8 +81,8 @@ define amdgpu_ps void @test_scalef32_sr_pk_fp4_bf16_dst_sel_1(ptr addrspace(1) % ; GFX950: ; %bb.0: ; GFX950-NEXT: global_load_dword v5, v[0:1], off ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cvt_scalef32_sr_pk_fp4_bf16 v5, v2, v3, v4 op_sel:[0,0,1,0] -; GFX950-NEXT: global_store_dword v[0:1], v5, off +; GFX950-NEXT: v_cvt_scalef32_sr_pk_fp4_bf16 v6, v2, v3, v4 op_sel:[0,0,1,0] +; GFX950-NEXT: global_store_dword v[0:1], v6, off ; GFX950-NEXT: s_endpgm %old = load i32, ptr addrspace(1) %out, align 4 %cvt = tail call i32 @llvm.amdgcn.cvt.scalef32.sr.pk.fp4.bf16(i32 %old, <2 x bfloat> %src, i32 %seed, float %scale, i32 1) @@ -95,8 +95,8 @@ define amdgpu_ps void @test_scalef32_sr_pk_fp4_bf16_dst_sel_2(ptr addrspace(1) % ; GFX950: ; %bb.0: ; GFX950-NEXT: global_load_dword v5, v[0:1], off ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cvt_scalef32_sr_pk_fp4_bf16 v5, v2, v3, v4 op_sel:[0,0,0,1] -; GFX950-NEXT: global_store_dword v[0:1], v5, off +; GFX950-NEXT: v_cvt_scalef32_sr_pk_fp4_bf16 v6, v2, v3, v4 op_sel:[0,0,0,1] +; GFX950-NEXT: global_store_dword v[0:1], v6, off ; GFX950-NEXT: s_endpgm %old = load i32, ptr addrspace(1) %out, align 4 %cvt = tail call i32 @llvm.amdgcn.cvt.scalef32.sr.pk.fp4.bf16(i32 %old, <2 x bfloat> %src, i32 %seed, float %scale, i32 2) @@ -109,8 +109,8 @@ define amdgpu_ps void @test_scalef32_sr_pk_fp4_bf16_dst_sel_3(ptr addrspace(1) % ; GFX950: ; %bb.0: ; GFX950-NEXT: global_load_dword v5, v[0:1], off ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cvt_scalef32_sr_pk_fp4_bf16 v5, v2, v3, v4 op_sel:[0,0,1,1] -; GFX950-NEXT: global_store_dword v[0:1], v5, off +; GFX950-NEXT: v_cvt_scalef32_sr_pk_fp4_bf16 v6, v2, v3, v4 op_sel:[0,0,1,1] +; GFX950-NEXT: global_store_dword v[0:1], v6, off ; GFX950-NEXT: s_endpgm %old = load i32, ptr addrspace(1) %out, align 4 %cvt = tail call i32 @llvm.amdgcn.cvt.scalef32.sr.pk.fp4.bf16(i32 %old, <2 x bfloat> %src, i32 %seed, 
float %scale, i32 3) @@ -123,8 +123,8 @@ define amdgpu_ps void @test_scalef32_sr_pk_fp4_f32_dst_sel_0(ptr addrspace(1) %o ; GFX950: ; %bb.0: ; GFX950-NEXT: global_load_dword v6, v[0:1], off ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cvt_scalef32_sr_pk_fp4_f32 v6, v[2:3], v4, v5 -; GFX950-NEXT: global_store_dword v[0:1], v6, off +; GFX950-NEXT: v_cvt_scalef32_sr_pk_fp4_f32 v7, v[2:3], v4, v5 +; GFX950-NEXT: global_store_dword v[0:1], v7, off ; GFX950-NEXT: s_endpgm %old = load i32, ptr addrspace(1) %out, align 4 %cvt = tail call i32 @llvm.amdgcn.cvt.scalef32.sr.pk.fp4.f32(i32 %old, <2 x float> %src, i32 %seed, float %scale, i32 0) @@ -137,8 +137,8 @@ define amdgpu_ps void @test_scalef32_sr_pk_fp4_f32_dst_sel_1(ptr addrspace(1) %o ; GFX950: ; %bb.0: ; GFX950-NEXT: global_load_dword v6, v[0:1], off ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cvt_scalef32_sr_pk_fp4_f32 v6, v[2:3], v4, v5 op_sel:[0,0,1,0] -; GFX950-NEXT: global_store_dword v[0:1], v6, off +; GFX950-NEXT: v_cvt_scalef32_sr_pk_fp4_f32 v7, v[2:3], v4, v5 op_sel:[0,0,1,0] +; GFX950-NEXT: global_store_dword v[0:1], v7, off ; GFX950-NEXT: s_endpgm %old = load i32, ptr addrspace(1) %out, align 4 %cvt = tail call i32 @llvm.amdgcn.cvt.scalef32.sr.pk.fp4.f32(i32 %old, <2 x float> %src, i32 %seed, float %scale, i32 1) @@ -151,8 +151,8 @@ define amdgpu_ps void @test_scalef32_sr_pk_fp4_f32_dst_sel_2(ptr addrspace(1) %o ; GFX950: ; %bb.0: ; GFX950-NEXT: global_load_dword v6, v[0:1], off ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cvt_scalef32_sr_pk_fp4_f32 v6, v[2:3], v4, v5 op_sel:[0,0,0,1] -; GFX950-NEXT: global_store_dword v[0:1], v6, off +; GFX950-NEXT: v_cvt_scalef32_sr_pk_fp4_f32 v7, v[2:3], v4, v5 op_sel:[0,0,0,1] +; GFX950-NEXT: global_store_dword v[0:1], v7, off ; GFX950-NEXT: s_endpgm %old = load i32, ptr addrspace(1) %out, align 4 %cvt = tail call i32 @llvm.amdgcn.cvt.scalef32.sr.pk.fp4.f32(i32 %old, <2 x float> %src, i32 %seed, float %scale, i32 2) @@ -165,8 +165,8 @@ define 
amdgpu_ps void @test_scalef32_sr_pk_fp4_f32_dst_sel_3(ptr addrspace(1) %o ; GFX950: ; %bb.0: ; GFX950-NEXT: global_load_dword v6, v[0:1], off ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cvt_scalef32_sr_pk_fp4_f32 v6, v[2:3], v4, v5 op_sel:[0,0,1,1] -; GFX950-NEXT: global_store_dword v[0:1], v6, off +; GFX950-NEXT: v_cvt_scalef32_sr_pk_fp4_f32 v7, v[2:3], v4, v5 op_sel:[0,0,1,1] +; GFX950-NEXT: global_store_dword v[0:1], v7, off ; GFX950-NEXT: s_endpgm %old = load i32, ptr addrspace(1) %out, align 4 %cvt = tail call i32 @llvm.amdgcn.cvt.scalef32.sr.pk.fp4.f32(i32 %old, <2 x float> %src, i32 %seed, float %scale, i32 3) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk.ll index 3e9ac6cbe3ba6e..0d4598f316c411 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk.ll @@ -12,9 +12,9 @@ declare <6 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk32.fp6.f32(<32 x float> %src, i define amdgpu_ps void @test_scalef32_sr_pk32_bf6_bf16_vv(<32 x bfloat> %src, i32 %sr, float %scale, ptr addrspace(1) %out) { ; GFX950-SDAG-LABEL: test_scalef32_sr_pk32_bf6_bf16_vv: ; GFX950-SDAG: ; %bb.0: -; GFX950-SDAG-NEXT: v_cvt_scalef32_sr_pk32_bf6_bf16 v[0:5], v[0:15], v16, v17 -; GFX950-SDAG-NEXT: global_store_dwordx2 v[18:19], v[4:5], off offset:16 -; GFX950-SDAG-NEXT: global_store_dwordx4 v[18:19], v[0:3], off +; GFX950-SDAG-NEXT: v_cvt_scalef32_sr_pk32_bf6_bf16 v[20:25], v[0:15], v16, v17 +; GFX950-SDAG-NEXT: global_store_dwordx2 v[18:19], v[24:25], off offset:16 +; GFX950-SDAG-NEXT: global_store_dwordx4 v[18:19], v[20:23], off ; GFX950-SDAG-NEXT: s_endpgm ; ; GFX950-GISEL-LABEL: test_scalef32_sr_pk32_bf6_bf16_vv: @@ -52,9 +52,9 @@ define amdgpu_ps void @test_scalef32_sr_pk32_bf6_bf16_vv(<32 x bfloat> %src, i32 ; GFX950-GISEL-NEXT: v_mov_b32_sdwa v14, v34 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 ; GFX950-GISEL-NEXT: 
v_mov_b32_sdwa v15, v35 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 ; GFX950-GISEL-NEXT: s_nop 0 -; GFX950-GISEL-NEXT: v_cvt_scalef32_sr_pk32_bf6_bf16 v[0:5], v[0:15], v16, v17 -; GFX950-GISEL-NEXT: global_store_dwordx4 v[18:19], v[0:3], off -; GFX950-GISEL-NEXT: global_store_dwordx2 v[18:19], v[4:5], off offset:16 +; GFX950-GISEL-NEXT: v_cvt_scalef32_sr_pk32_bf6_bf16 v[20:25], v[0:15], v16, v17 +; GFX950-GISEL-NEXT: global_store_dwordx4 v[18:19], v[20:23], off +; GFX950-GISEL-NEXT: global_store_dwordx2 v[18:19], v[24:25], off offset:16 ; GFX950-GISEL-NEXT: s_endpgm %cvt = tail call <6 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk32.bf6.bf16(<32 x bfloat> %src, i32 %sr, float %scale) store <6 x i32> %cvt, ptr addrspace(1) %out, align 8 @@ -80,10 +80,10 @@ define amdgpu_ps void @test_scalef32_sr_pk32_bf6_bf16_sl(<32 x bfloat> inreg %sr ; GFX950-SDAG-NEXT: v_mov_b32_e32 v15, s13 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, s14 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v17, s15 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, 0x42c80000 -; GFX950-SDAG-NEXT: v_cvt_scalef32_sr_pk32_bf6_bf16 v[2:7], v[2:17], s16, v18 -; GFX950-SDAG-NEXT: global_store_dwordx2 v[0:1], v[6:7], off offset:16 -; GFX950-SDAG-NEXT: global_store_dwordx4 v[0:1], v[2:5], off +; GFX950-SDAG-NEXT: v_mov_b32_e32 v24, 0x42c80000 +; GFX950-SDAG-NEXT: v_cvt_scalef32_sr_pk32_bf6_bf16 v[18:23], v[2:17], s16, v24 +; GFX950-SDAG-NEXT: global_store_dwordx2 v[0:1], v[22:23], off offset:16 +; GFX950-SDAG-NEXT: global_store_dwordx4 v[0:1], v[18:21], off ; GFX950-SDAG-NEXT: s_endpgm ; ; GFX950-GISEL-LABEL: test_scalef32_sr_pk32_bf6_bf16_sl: @@ -160,10 +160,10 @@ define amdgpu_ps void @test_scalef32_sr_pk32_bf6_bf16_sl(<32 x bfloat> inreg %sr ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[4:5] ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[2:3] ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] -; GFX950-GISEL-NEXT: v_mov_b32_e32 v18, 0x42c80000 -; GFX950-GISEL-NEXT: v_cvt_scalef32_sr_pk32_bf6_bf16 v[2:7], v[2:17], s16, v18 -; 
GFX950-GISEL-NEXT: global_store_dwordx4 v[0:1], v[2:5], off -; GFX950-GISEL-NEXT: global_store_dwordx2 v[0:1], v[6:7], off offset:16 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v24, 0x42c80000 +; GFX950-GISEL-NEXT: v_cvt_scalef32_sr_pk32_bf6_bf16 v[18:23], v[2:17], s16, v24 +; GFX950-GISEL-NEXT: global_store_dwordx4 v[0:1], v[18:21], off +; GFX950-GISEL-NEXT: global_store_dwordx2 v[0:1], v[22:23], off offset:16 ; GFX950-GISEL-NEXT: s_endpgm %cvt = tail call <6 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk32.bf6.bf16(<32 x bfloat> %src, i32 %sr, float 100.0) store <6 x i32> %cvt, ptr addrspace(1) %out, align 8 @@ -173,16 +173,16 @@ define amdgpu_ps void @test_scalef32_sr_pk32_bf6_bf16_sl(<32 x bfloat> inreg %sr define amdgpu_ps void @test_scalef32_sr_pk32_bf6_f16_vv(<32 x half> %src, i32 %sr, float %scale, ptr addrspace(1) %out) { ; GFX950-SDAG-LABEL: test_scalef32_sr_pk32_bf6_f16_vv: ; GFX950-SDAG: ; %bb.0: -; GFX950-SDAG-NEXT: v_cvt_scalef32_sr_pk32_bf6_f16 v[0:5], v[0:15], v16, v17 -; GFX950-SDAG-NEXT: global_store_dwordx2 v[18:19], v[4:5], off offset:16 -; GFX950-SDAG-NEXT: global_store_dwordx4 v[18:19], v[0:3], off +; GFX950-SDAG-NEXT: v_cvt_scalef32_sr_pk32_bf6_f16 v[20:25], v[0:15], v16, v17 +; GFX950-SDAG-NEXT: global_store_dwordx2 v[18:19], v[24:25], off offset:16 +; GFX950-SDAG-NEXT: global_store_dwordx4 v[18:19], v[20:23], off ; GFX950-SDAG-NEXT: s_endpgm ; ; GFX950-GISEL-LABEL: test_scalef32_sr_pk32_bf6_f16_vv: ; GFX950-GISEL: ; %bb.0: -; GFX950-GISEL-NEXT: v_cvt_scalef32_sr_pk32_bf6_f16 v[0:5], v[0:15], v16, v17 -; GFX950-GISEL-NEXT: global_store_dwordx4 v[18:19], v[0:3], off -; GFX950-GISEL-NEXT: global_store_dwordx2 v[18:19], v[4:5], off offset:16 +; GFX950-GISEL-NEXT: v_cvt_scalef32_sr_pk32_bf6_f16 v[20:25], v[0:15], v16, v17 +; GFX950-GISEL-NEXT: global_store_dwordx4 v[18:19], v[20:23], off +; GFX950-GISEL-NEXT: global_store_dwordx2 v[18:19], v[24:25], off offset:16 ; GFX950-GISEL-NEXT: s_endpgm %cvt = tail call <6 x i32> 
@llvm.amdgcn.cvt.scalef32.sr.pk32.bf6.f16(<32 x half> %src, i32 %sr, float %scale) store <6 x i32> %cvt, ptr addrspace(1) %out, align 8 @@ -208,10 +208,10 @@ define amdgpu_ps void @test_scalef32_sr_pk32_bf6_f16_sl(<32 x half> inreg %src, ; GFX950-SDAG-NEXT: v_mov_b32_e32 v15, s13 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, s14 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v17, s15 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, 0x42c80000 -; GFX950-SDAG-NEXT: v_cvt_scalef32_sr_pk32_bf6_f16 v[2:7], v[2:17], s16, v18 -; GFX950-SDAG-NEXT: global_store_dwordx2 v[0:1], v[6:7], off offset:16 -; GFX950-SDAG-NEXT: global_store_dwordx4 v[0:1], v[2:5], off +; GFX950-SDAG-NEXT: v_mov_b32_e32 v24, 0x42c80000 +; GFX950-SDAG-NEXT: v_cvt_scalef32_sr_pk32_bf6_f16 v[18:23], v[2:17], s16, v24 +; GFX950-SDAG-NEXT: global_store_dwordx2 v[0:1], v[22:23], off offset:16 +; GFX950-SDAG-NEXT: global_store_dwordx4 v[0:1], v[18:21], off ; GFX950-SDAG-NEXT: s_endpgm ; ; GFX950-GISEL-LABEL: test_scalef32_sr_pk32_bf6_f16_sl: @@ -224,10 +224,10 @@ define amdgpu_ps void @test_scalef32_sr_pk32_bf6_f16_sl(<32 x half> inreg %src, ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[4:5] ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[2:3] ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] -; GFX950-GISEL-NEXT: v_mov_b32_e32 v18, 0x42c80000 -; GFX950-GISEL-NEXT: v_cvt_scalef32_sr_pk32_bf6_f16 v[2:7], v[2:17], s16, v18 -; GFX950-GISEL-NEXT: global_store_dwordx4 v[0:1], v[2:5], off -; GFX950-GISEL-NEXT: global_store_dwordx2 v[0:1], v[6:7], off offset:16 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v24, 0x42c80000 +; GFX950-GISEL-NEXT: v_cvt_scalef32_sr_pk32_bf6_f16 v[18:23], v[2:17], s16, v24 +; GFX950-GISEL-NEXT: global_store_dwordx4 v[0:1], v[18:21], off +; GFX950-GISEL-NEXT: global_store_dwordx2 v[0:1], v[22:23], off offset:16 ; GFX950-GISEL-NEXT: s_endpgm %cvt = tail call <6 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk32.bf6.f16(<32 x half> %src, i32 %sr, float 100.0) store <6 x i32> %cvt, ptr addrspace(1) %out, align 8 @@ -237,9 +237,9 @@ define 
amdgpu_ps void @test_scalef32_sr_pk32_bf6_f16_sl(<32 x half> inreg %src, define amdgpu_ps void @test_scalef32_sr_pk32_fp6_bf16_vv(<32 x bfloat> %src, i32 %sr, float %scale, ptr addrspace(1) %out) { ; GFX950-SDAG-LABEL: test_scalef32_sr_pk32_fp6_bf16_vv: ; GFX950-SDAG: ; %bb.0: -; GFX950-SDAG-NEXT: v_cvt_scalef32_sr_pk32_fp6_bf16 v[0:5], v[0:15], v16, v17 -; GFX950-SDAG-NEXT: global_store_dwordx2 v[18:19], v[4:5], off offset:16 -; GFX950-SDAG-NEXT: global_store_dwordx4 v[18:19], v[0:3], off +; GFX950-SDAG-NEXT: v_cvt_scalef32_sr_pk32_fp6_bf16 v[20:25], v[0:15], v16, v17 +; GFX950-SDAG-NEXT: global_store_dwordx2 v[18:19], v[24:25], off offset:16 +; GFX950-SDAG-NEXT: global_store_dwordx4 v[18:19], v[20:23], off ; GFX950-SDAG-NEXT: s_endpgm ; ; GFX950-GISEL-LABEL: test_scalef32_sr_pk32_fp6_bf16_vv: @@ -277,9 +277,9 @@ define amdgpu_ps void @test_scalef32_sr_pk32_fp6_bf16_vv(<32 x bfloat> %src, i32 ; GFX950-GISEL-NEXT: v_mov_b32_sdwa v14, v34 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 ; GFX950-GISEL-NEXT: v_mov_b32_sdwa v15, v35 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 ; GFX950-GISEL-NEXT: s_nop 0 -; GFX950-GISEL-NEXT: v_cvt_scalef32_sr_pk32_fp6_bf16 v[0:5], v[0:15], v16, v17 -; GFX950-GISEL-NEXT: global_store_dwordx4 v[18:19], v[0:3], off -; GFX950-GISEL-NEXT: global_store_dwordx2 v[18:19], v[4:5], off offset:16 +; GFX950-GISEL-NEXT: v_cvt_scalef32_sr_pk32_fp6_bf16 v[20:25], v[0:15], v16, v17 +; GFX950-GISEL-NEXT: global_store_dwordx4 v[18:19], v[20:23], off +; GFX950-GISEL-NEXT: global_store_dwordx2 v[18:19], v[24:25], off offset:16 ; GFX950-GISEL-NEXT: s_endpgm %cvt = tail call <6 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk32.fp6.bf16(<32 x bfloat> %src, i32 %sr, float %scale) store <6 x i32> %cvt, ptr addrspace(1) %out, align 8 @@ -305,10 +305,10 @@ define amdgpu_ps void @test_scalef32_sr_pk32_fp6_bf16_sl(<32 x bfloat> inreg %sr ; GFX950-SDAG-NEXT: v_mov_b32_e32 v15, s13 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, s14 ; GFX950-SDAG-NEXT: 
v_mov_b32_e32 v17, s15 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, 0x42c80000 -; GFX950-SDAG-NEXT: v_cvt_scalef32_sr_pk32_fp6_bf16 v[2:7], v[2:17], s16, v18 -; GFX950-SDAG-NEXT: global_store_dwordx2 v[0:1], v[6:7], off offset:16 -; GFX950-SDAG-NEXT: global_store_dwordx4 v[0:1], v[2:5], off +; GFX950-SDAG-NEXT: v_mov_b32_e32 v24, 0x42c80000 +; GFX950-SDAG-NEXT: v_cvt_scalef32_sr_pk32_fp6_bf16 v[18:23], v[2:17], s16, v24 +; GFX950-SDAG-NEXT: global_store_dwordx2 v[0:1], v[22:23], off offset:16 +; GFX950-SDAG-NEXT: global_store_dwordx4 v[0:1], v[18:21], off ; GFX950-SDAG-NEXT: s_endpgm ; ; GFX950-GISEL-LABEL: test_scalef32_sr_pk32_fp6_bf16_sl: @@ -385,10 +385,10 @@ define amdgpu_ps void @test_scalef32_sr_pk32_fp6_bf16_sl(<32 x bfloat> inreg %sr ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[4:5] ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[2:3] ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] -; GFX950-GISEL-NEXT: v_mov_b32_e32 v18, 0x42c80000 -; GFX950-GISEL-NEXT: v_cvt_scalef32_sr_pk32_fp6_bf16 v[2:7], v[2:17], s16, v18 -; GFX950-GISEL-NEXT: global_store_dwordx4 v[0:1], v[2:5], off -; GFX950-GISEL-NEXT: global_store_dwordx2 v[0:1], v[6:7], off offset:16 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v24, 0x42c80000 +; GFX950-GISEL-NEXT: v_cvt_scalef32_sr_pk32_fp6_bf16 v[18:23], v[2:17], s16, v24 +; GFX950-GISEL-NEXT: global_store_dwordx4 v[0:1], v[18:21], off +; GFX950-GISEL-NEXT: global_store_dwordx2 v[0:1], v[22:23], off offset:16 ; GFX950-GISEL-NEXT: s_endpgm %cvt = tail call <6 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk32.fp6.bf16(<32 x bfloat> %src, i32 %sr, float 100.0) store <6 x i32> %cvt, ptr addrspace(1) %out, align 8 @@ -398,16 +398,16 @@ define amdgpu_ps void @test_scalef32_sr_pk32_fp6_bf16_sl(<32 x bfloat> inreg %sr define amdgpu_ps void @test_scalef32_sr_pk32_fp6_f16_vv(<32 x half> %src, i32 %sr, float %scale, ptr addrspace(1) %out) { ; GFX950-SDAG-LABEL: test_scalef32_sr_pk32_fp6_f16_vv: ; GFX950-SDAG: ; %bb.0: -; GFX950-SDAG-NEXT: v_cvt_scalef32_sr_pk32_fp6_f16 v[0:5], 
v[0:15], v16, v17 -; GFX950-SDAG-NEXT: global_store_dwordx2 v[18:19], v[4:5], off offset:16 -; GFX950-SDAG-NEXT: global_store_dwordx4 v[18:19], v[0:3], off +; GFX950-SDAG-NEXT: v_cvt_scalef32_sr_pk32_fp6_f16 v[20:25], v[0:15], v16, v17 +; GFX950-SDAG-NEXT: global_store_dwordx2 v[18:19], v[24:25], off offset:16 +; GFX950-SDAG-NEXT: global_store_dwordx4 v[18:19], v[20:23], off ; GFX950-SDAG-NEXT: s_endpgm ; ; GFX950-GISEL-LABEL: test_scalef32_sr_pk32_fp6_f16_vv: ; GFX950-GISEL: ; %bb.0: -; GFX950-GISEL-NEXT: v_cvt_scalef32_sr_pk32_fp6_f16 v[0:5], v[0:15], v16, v17 -; GFX950-GISEL-NEXT: global_store_dwordx4 v[18:19], v[0:3], off -; GFX950-GISEL-NEXT: global_store_dwordx2 v[18:19], v[4:5], off offset:16 +; GFX950-GISEL-NEXT: v_cvt_scalef32_sr_pk32_fp6_f16 v[20:25], v[0:15], v16, v17 +; GFX950-GISEL-NEXT: global_store_dwordx4 v[18:19], v[20:23], off +; GFX950-GISEL-NEXT: global_store_dwordx2 v[18:19], v[24:25], off offset:16 ; GFX950-GISEL-NEXT: s_endpgm %cvt = tail call <6 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk32.fp6.f16(<32 x half> %src, i32 %sr, float %scale) store <6 x i32> %cvt, ptr addrspace(1) %out, align 8 @@ -433,10 +433,10 @@ define amdgpu_ps void @test_scalef32_sr_pk32_fp6_f16_sl(<32 x half> inreg %src, ; GFX950-SDAG-NEXT: v_mov_b32_e32 v15, s13 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, s14 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v17, s15 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, 0x42c80000 -; GFX950-SDAG-NEXT: v_cvt_scalef32_sr_pk32_fp6_f16 v[2:7], v[2:17], s16, v18 -; GFX950-SDAG-NEXT: global_store_dwordx2 v[0:1], v[6:7], off offset:16 -; GFX950-SDAG-NEXT: global_store_dwordx4 v[0:1], v[2:5], off +; GFX950-SDAG-NEXT: v_mov_b32_e32 v24, 0x42c80000 +; GFX950-SDAG-NEXT: v_cvt_scalef32_sr_pk32_fp6_f16 v[18:23], v[2:17], s16, v24 +; GFX950-SDAG-NEXT: global_store_dwordx2 v[0:1], v[22:23], off offset:16 +; GFX950-SDAG-NEXT: global_store_dwordx4 v[0:1], v[18:21], off ; GFX950-SDAG-NEXT: s_endpgm ; ; GFX950-GISEL-LABEL: test_scalef32_sr_pk32_fp6_f16_sl: @@ -449,10 +449,10 @@ 
define amdgpu_ps void @test_scalef32_sr_pk32_fp6_f16_sl(<32 x half> inreg %src, ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[4:5] ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[2:3] ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] -; GFX950-GISEL-NEXT: v_mov_b32_e32 v18, 0x42c80000 -; GFX950-GISEL-NEXT: v_cvt_scalef32_sr_pk32_fp6_f16 v[2:7], v[2:17], s16, v18 -; GFX950-GISEL-NEXT: global_store_dwordx4 v[0:1], v[2:5], off -; GFX950-GISEL-NEXT: global_store_dwordx2 v[0:1], v[6:7], off offset:16 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v24, 0x42c80000 +; GFX950-GISEL-NEXT: v_cvt_scalef32_sr_pk32_fp6_f16 v[18:23], v[2:17], s16, v24 +; GFX950-GISEL-NEXT: global_store_dwordx4 v[0:1], v[18:21], off +; GFX950-GISEL-NEXT: global_store_dwordx2 v[0:1], v[22:23], off offset:16 ; GFX950-GISEL-NEXT: s_endpgm %cvt = tail call <6 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk32.fp6.f16(<32 x half> %src, i32 %sr, float 100.0) store <6 x i32> %cvt, ptr addrspace(1) %out, align 8 @@ -462,16 +462,16 @@ define amdgpu_ps void @test_scalef32_sr_pk32_fp6_f16_sl(<32 x half> inreg %src, define amdgpu_ps void @test_scalef32_sr_pk32_bf6_f32_vv(<32 x float> %src, i32 %sr, float %scale, ptr addrspace(1) %out) { ; GFX950-SDAG-LABEL: test_scalef32_sr_pk32_bf6_f32_vv: ; GFX950-SDAG: ; %bb.0: -; GFX950-SDAG-NEXT: v_cvt_scalef32_sr_pk32_bf6_f32 v[0:5], v[0:31], v32, v33 -; GFX950-SDAG-NEXT: global_store_dwordx2 v[34:35], v[4:5], off offset:16 -; GFX950-SDAG-NEXT: global_store_dwordx4 v[34:35], v[0:3], off +; GFX950-SDAG-NEXT: v_cvt_scalef32_sr_pk32_bf6_f32 v[36:41], v[0:31], v32, v33 +; GFX950-SDAG-NEXT: global_store_dwordx2 v[34:35], v[40:41], off offset:16 +; GFX950-SDAG-NEXT: global_store_dwordx4 v[34:35], v[36:39], off ; GFX950-SDAG-NEXT: s_endpgm ; ; GFX950-GISEL-LABEL: test_scalef32_sr_pk32_bf6_f32_vv: ; GFX950-GISEL: ; %bb.0: -; GFX950-GISEL-NEXT: v_cvt_scalef32_sr_pk32_bf6_f32 v[0:5], v[0:31], v32, v33 -; GFX950-GISEL-NEXT: global_store_dwordx4 v[34:35], v[0:3], off -; GFX950-GISEL-NEXT: 
global_store_dwordx2 v[34:35], v[4:5], off offset:16 +; GFX950-GISEL-NEXT: v_cvt_scalef32_sr_pk32_bf6_f32 v[36:41], v[0:31], v32, v33 +; GFX950-GISEL-NEXT: global_store_dwordx4 v[34:35], v[36:39], off +; GFX950-GISEL-NEXT: global_store_dwordx2 v[34:35], v[40:41], off offset:16 ; GFX950-GISEL-NEXT: s_endpgm %cvt = tail call <6 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk32.bf6.f32(<32 x float> %src, i32 %sr, float %scale) store <6 x i32> %cvt, ptr addrspace(1) %out, align 8 @@ -513,10 +513,10 @@ define amdgpu_ps void @test_scalef32_sr_pk32_bf6_f32_sl(<32 x float> inreg %src, ; GFX950-SDAG-NEXT: v_mov_b32_e32 v31, s29 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v32, s30 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v33, s31 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v34, 0x42c80000 -; GFX950-SDAG-NEXT: v_cvt_scalef32_sr_pk32_bf6_f32 v[2:7], v[2:33], s32, v34 -; GFX950-SDAG-NEXT: global_store_dwordx2 v[0:1], v[6:7], off offset:16 -; GFX950-SDAG-NEXT: global_store_dwordx4 v[0:1], v[2:5], off +; GFX950-SDAG-NEXT: v_mov_b32_e32 v40, 0x42c80000 +; GFX950-SDAG-NEXT: v_cvt_scalef32_sr_pk32_bf6_f32 v[34:39], v[2:33], s32, v40 +; GFX950-SDAG-NEXT: global_store_dwordx2 v[0:1], v[38:39], off offset:16 +; GFX950-SDAG-NEXT: global_store_dwordx4 v[0:1], v[34:37], off ; GFX950-SDAG-NEXT: s_endpgm ; ; GFX950-GISEL-LABEL: test_scalef32_sr_pk32_bf6_f32_sl: @@ -537,10 +537,10 @@ define amdgpu_ps void @test_scalef32_sr_pk32_bf6_f32_sl(<32 x float> inreg %src, ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[4:5] ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[2:3] ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] -; GFX950-GISEL-NEXT: v_mov_b32_e32 v34, 0x42c80000 -; GFX950-GISEL-NEXT: v_cvt_scalef32_sr_pk32_bf6_f32 v[2:7], v[2:33], s32, v34 -; GFX950-GISEL-NEXT: global_store_dwordx4 v[0:1], v[2:5], off -; GFX950-GISEL-NEXT: global_store_dwordx2 v[0:1], v[6:7], off offset:16 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v40, 0x42c80000 +; GFX950-GISEL-NEXT: v_cvt_scalef32_sr_pk32_bf6_f32 v[34:39], v[2:33], s32, v40 +; GFX950-GISEL-NEXT: 
global_store_dwordx4 v[0:1], v[34:37], off +; GFX950-GISEL-NEXT: global_store_dwordx2 v[0:1], v[38:39], off offset:16 ; GFX950-GISEL-NEXT: s_endpgm %cvt = tail call <6 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk32.bf6.f32(<32 x float> %src, i32 %sr, float 100.0) store <6 x i32> %cvt, ptr addrspace(1) %out, align 8 @@ -550,16 +550,16 @@ define amdgpu_ps void @test_scalef32_sr_pk32_bf6_f32_sl(<32 x float> inreg %src, define amdgpu_ps void @test_scalef32_sr_pk32_fp6_f32_vv(<32 x float> %src, i32 %sr, float %scale, ptr addrspace(1) %out) { ; GFX950-SDAG-LABEL: test_scalef32_sr_pk32_fp6_f32_vv: ; GFX950-SDAG: ; %bb.0: -; GFX950-SDAG-NEXT: v_cvt_scalef32_sr_pk32_fp6_f32 v[0:5], v[0:31], v32, v33 -; GFX950-SDAG-NEXT: global_store_dwordx2 v[34:35], v[4:5], off offset:16 -; GFX950-SDAG-NEXT: global_store_dwordx4 v[34:35], v[0:3], off +; GFX950-SDAG-NEXT: v_cvt_scalef32_sr_pk32_fp6_f32 v[36:41], v[0:31], v32, v33 +; GFX950-SDAG-NEXT: global_store_dwordx2 v[34:35], v[40:41], off offset:16 +; GFX950-SDAG-NEXT: global_store_dwordx4 v[34:35], v[36:39], off ; GFX950-SDAG-NEXT: s_endpgm ; ; GFX950-GISEL-LABEL: test_scalef32_sr_pk32_fp6_f32_vv: ; GFX950-GISEL: ; %bb.0: -; GFX950-GISEL-NEXT: v_cvt_scalef32_sr_pk32_fp6_f32 v[0:5], v[0:31], v32, v33 -; GFX950-GISEL-NEXT: global_store_dwordx4 v[34:35], v[0:3], off -; GFX950-GISEL-NEXT: global_store_dwordx2 v[34:35], v[4:5], off offset:16 +; GFX950-GISEL-NEXT: v_cvt_scalef32_sr_pk32_fp6_f32 v[36:41], v[0:31], v32, v33 +; GFX950-GISEL-NEXT: global_store_dwordx4 v[34:35], v[36:39], off +; GFX950-GISEL-NEXT: global_store_dwordx2 v[34:35], v[40:41], off offset:16 ; GFX950-GISEL-NEXT: s_endpgm %cvt = tail call <6 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk32.fp6.f32(<32 x float> %src, i32 %sr, float %scale) store <6 x i32> %cvt, ptr addrspace(1) %out, align 8 @@ -601,10 +601,10 @@ define amdgpu_ps void @test_scalef32_sr_pk32_fp6_f32_sl(<32 x float> inreg %src, ; GFX950-SDAG-NEXT: v_mov_b32_e32 v31, s29 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v32, s30 ; 
GFX950-SDAG-NEXT: v_mov_b32_e32 v33, s31 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v34, 0x42c80000 -; GFX950-SDAG-NEXT: v_cvt_scalef32_sr_pk32_fp6_f32 v[2:7], v[2:33], s32, v34 -; GFX950-SDAG-NEXT: global_store_dwordx2 v[0:1], v[6:7], off offset:16 -; GFX950-SDAG-NEXT: global_store_dwordx4 v[0:1], v[2:5], off +; GFX950-SDAG-NEXT: v_mov_b32_e32 v40, 0x42c80000 +; GFX950-SDAG-NEXT: v_cvt_scalef32_sr_pk32_fp6_f32 v[34:39], v[2:33], s32, v40 +; GFX950-SDAG-NEXT: global_store_dwordx2 v[0:1], v[38:39], off offset:16 +; GFX950-SDAG-NEXT: global_store_dwordx4 v[0:1], v[34:37], off ; GFX950-SDAG-NEXT: s_endpgm ; ; GFX950-GISEL-LABEL: test_scalef32_sr_pk32_fp6_f32_sl: @@ -625,10 +625,10 @@ define amdgpu_ps void @test_scalef32_sr_pk32_fp6_f32_sl(<32 x float> inreg %src, ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[4:5] ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[2:3] ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] -; GFX950-GISEL-NEXT: v_mov_b32_e32 v34, 0x42c80000 -; GFX950-GISEL-NEXT: v_cvt_scalef32_sr_pk32_fp6_f32 v[2:7], v[2:33], s32, v34 -; GFX950-GISEL-NEXT: global_store_dwordx4 v[0:1], v[2:5], off -; GFX950-GISEL-NEXT: global_store_dwordx2 v[0:1], v[6:7], off offset:16 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v40, 0x42c80000 +; GFX950-GISEL-NEXT: v_cvt_scalef32_sr_pk32_fp6_f32 v[34:39], v[2:33], s32, v40 +; GFX950-GISEL-NEXT: global_store_dwordx4 v[0:1], v[34:37], off +; GFX950-GISEL-NEXT: global_store_dwordx2 v[0:1], v[38:39], off offset:16 ; GFX950-GISEL-NEXT: s_endpgm %cvt = tail call <6 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk32.fp6.f32(<32 x float> %src, i32 %sr, float 100.0) store <6 x i32> %cvt, ptr addrspace(1) %out, align 8 _______________________________________________ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits