nhaehnle updated this revision to Diff 286898.
nhaehnle added a comment.
Don't duplicate the intrinsics. Rely on D86317
<https://reviews.llvm.org/D86317> to reduce the pain this
change causes to downstream users.
Repository:
rG LLVM Github Monorepo
CHANGES SINCE LAST ACTION
https://reviews.llvm.org/D86154/new/
https://reviews.llvm.org/D86154
Files:
clang/lib/CodeGen/CGBuiltin.cpp
clang/test/CodeGenOpenCL/builtins-amdgcn.cl
llvm/include/llvm/IR/IntrinsicsAMDGPU.td
llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
llvm/lib/Target/AMDGPU/SIInstructions.td
llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.readfirstlane.mir
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll
llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll
Index: llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll
===================================================================
--- llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll
+++ llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll
@@ -2507,8 +2507,8 @@
define amdgpu_kernel void @readfirstlane_constant(i32 %arg) {
; CHECK-LABEL: @readfirstlane_constant(
-; CHECK-NEXT: [[VAR:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[ARG:%.*]])
-; CHECK-NEXT: store volatile i32 [[VAR]], i32* undef, align 4
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[ARG:%.*]])
+; CHECK-NEXT: store volatile i32 [[TMP1]], i32* undef, align 4
; CHECK-NEXT: store volatile i32 0, i32* undef, align 4
; CHECK-NEXT: store volatile i32 123, i32* undef, align 4
; CHECK-NEXT: store volatile i32 ptrtoint (i32* @gv to i32), i32* undef, align 4
@@ -2530,8 +2530,8 @@
define i32 @readfirstlane_idempotent(i32 %arg) {
; CHECK-LABEL: @readfirstlane_idempotent(
-; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[ARG:%.*]])
-; CHECK-NEXT: ret i32 [[READ0]]
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[ARG:%.*]])
+; CHECK-NEXT: ret i32 [[TMP1]]
;
%read0 = call i32 @llvm.amdgcn.readfirstlane(i32 %arg)
%read1 = call i32 @llvm.amdgcn.readfirstlane(i32 %read0)
@@ -2541,8 +2541,8 @@
define i32 @readfirstlane_readlane(i32 %arg) {
; CHECK-LABEL: @readfirstlane_readlane(
-; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[ARG:%.*]])
-; CHECK-NEXT: ret i32 [[READ0]]
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[ARG:%.*]])
+; CHECK-NEXT: ret i32 [[TMP1]]
;
%read0 = call i32 @llvm.amdgcn.readfirstlane(i32 %arg)
%read1 = call i32 @llvm.amdgcn.readlane(i32 %read0, i32 0)
@@ -2552,11 +2552,11 @@
define i32 @readfirstlane_readfirstlane_different_block(i32 %arg) {
; CHECK-LABEL: @readfirstlane_readfirstlane_different_block(
; CHECK-NEXT: bb0:
-; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[ARG:%.*]])
+; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[ARG:%.*]])
; CHECK-NEXT: br label [[BB1:%.*]]
; CHECK: bb1:
-; CHECK-NEXT: [[READ1:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[READ0]])
-; CHECK-NEXT: ret i32 [[READ1]]
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP0]])
+; CHECK-NEXT: ret i32 [[TMP1]]
;
bb0:
%read0 = call i32 @llvm.amdgcn.readfirstlane(i32 %arg)
@@ -2570,11 +2570,11 @@
define i32 @readfirstlane_readlane_different_block(i32 %arg) {
; CHECK-LABEL: @readfirstlane_readlane_different_block(
; CHECK-NEXT: bb0:
-; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[ARG:%.*]], i32 0)
+; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG:%.*]], i32 0)
; CHECK-NEXT: br label [[BB1:%.*]]
; CHECK: bb1:
-; CHECK-NEXT: [[READ1:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[READ0]])
-; CHECK-NEXT: ret i32 [[READ1]]
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP0]])
+; CHECK-NEXT: ret i32 [[TMP1]]
;
bb0:
%read0 = call i32 @llvm.amdgcn.readlane(i32 %arg, i32 0)
@@ -2585,6 +2585,41 @@
ret i32 %read1
}
+define i32 @readfirstlane_bitcast(float %arg) {
+; CHECK-LABEL: @readfirstlane_bitcast(
+; CHECK-NEXT: [[TMP1:%.*]] = call float @llvm.amdgcn.readfirstlane.f32(float [[ARG:%.*]])
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast float [[TMP1]] to i32
+; CHECK-NEXT: ret i32 [[TMP2]]
+;
+ %bitcast.arg = bitcast float %arg to i32
+ %read = call i32 @llvm.amdgcn.readfirstlane(i32 %bitcast.arg)
+ ret i32 %read
+}
+
+define float @bitcast_readfirstlane_bitcast(float %arg) {
+; CHECK-LABEL: @bitcast_readfirstlane_bitcast(
+; CHECK-NEXT: [[TMP1:%.*]] = call float @llvm.amdgcn.readfirstlane.f32(float [[ARG:%.*]])
+; CHECK-NEXT: ret float [[TMP1]]
+;
+ %bitcast.arg = bitcast float %arg to i32
+ %read = call i32 @llvm.amdgcn.readfirstlane(i32 %bitcast.arg)
+ %cast.read = bitcast i32 %read to float
+ ret float %cast.read
+}
+
+define i32 @readfirstlane_bitcast_multi_use(float %arg) {
+; CHECK-LABEL: @readfirstlane_bitcast_multi_use(
+; CHECK-NEXT: store float [[ARG:%.*]], float* undef, align 4
+; CHECK-NEXT: [[TMP1:%.*]] = call float @llvm.amdgcn.readfirstlane.f32(float [[ARG]])
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast float [[TMP1]] to i32
+; CHECK-NEXT: ret i32 [[TMP2]]
+;
+ %bitcast.arg = bitcast float %arg to i32
+ store i32 %bitcast.arg, i32* undef
+ %read = call i32 @llvm.amdgcn.readfirstlane(i32 %bitcast.arg)
+ ret i32 %read
+}
+
; --------------------------------------------------------------------
; llvm.amdgcn.readlane
; --------------------------------------------------------------------
@@ -2593,8 +2628,8 @@
define amdgpu_kernel void @readlane_constant(i32 %arg, i32 %lane) {
; CHECK-LABEL: @readlane_constant(
-; CHECK-NEXT: [[VAR:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[ARG:%.*]], i32 7)
-; CHECK-NEXT: store volatile i32 [[VAR]], i32* undef, align 4
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG:%.*]], i32 7)
+; CHECK-NEXT: store volatile i32 [[TMP1]], i32* undef, align 4
; CHECK-NEXT: store volatile i32 0, i32* undef, align 4
; CHECK-NEXT: store volatile i32 123, i32* undef, align 4
; CHECK-NEXT: store volatile i32 ptrtoint (i32* @gv to i32), i32* undef, align 4
@@ -2616,8 +2651,8 @@
define i32 @readlane_idempotent(i32 %arg, i32 %lane) {
; CHECK-LABEL: @readlane_idempotent(
-; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[ARG:%.*]], i32 [[LANE:%.*]])
-; CHECK-NEXT: ret i32 [[READ0]]
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG:%.*]], i32 [[LANE:%.*]])
+; CHECK-NEXT: ret i32 [[TMP1]]
;
%read0 = call i32 @llvm.amdgcn.readlane(i32 %arg, i32 %lane)
%read1 = call i32 @llvm.amdgcn.readlane(i32 %read0, i32 %lane)
@@ -2626,9 +2661,9 @@
define i32 @readlane_idempotent_different_lanes(i32 %arg, i32 %lane0, i32 %lane1) {
; CHECK-LABEL: @readlane_idempotent_different_lanes(
-; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[ARG:%.*]], i32 [[LANE0:%.*]])
-; CHECK-NEXT: [[READ1:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[READ0]], i32 [[LANE1:%.*]])
-; CHECK-NEXT: ret i32 [[READ1]]
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG:%.*]], i32 [[LANE0:%.*]])
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP1]], i32 [[LANE1:%.*]])
+; CHECK-NEXT: ret i32 [[TMP2]]
;
%read0 = call i32 @llvm.amdgcn.readlane(i32 %arg, i32 %lane0)
%read1 = call i32 @llvm.amdgcn.readlane(i32 %read0, i32 %lane1)
@@ -2637,8 +2672,8 @@
define i32 @readlane_readfirstlane(i32 %arg) {
; CHECK-LABEL: @readlane_readfirstlane(
-; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[ARG:%.*]])
-; CHECK-NEXT: ret i32 [[READ0]]
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[ARG:%.*]])
+; CHECK-NEXT: ret i32 [[TMP1]]
;
%read0 = call i32 @llvm.amdgcn.readfirstlane(i32 %arg)
%read1 = call i32 @llvm.amdgcn.readlane(i32 %read0, i32 0)
@@ -2648,11 +2683,11 @@
define i32 @readlane_idempotent_different_block(i32 %arg, i32 %lane) {
; CHECK-LABEL: @readlane_idempotent_different_block(
; CHECK-NEXT: bb0:
-; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[ARG:%.*]], i32 [[LANE:%.*]])
+; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG:%.*]], i32 [[LANE:%.*]])
; CHECK-NEXT: br label [[BB1:%.*]]
; CHECK: bb1:
-; CHECK-NEXT: [[READ1:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[READ0]], i32 [[LANE]])
-; CHECK-NEXT: ret i32 [[READ1]]
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP0]], i32 [[LANE]])
+; CHECK-NEXT: ret i32 [[TMP1]]
;
bb0:
%read0 = call i32 @llvm.amdgcn.readlane(i32 %arg, i32 %lane)
@@ -2667,11 +2702,11 @@
define i32 @readlane_readfirstlane_different_block(i32 %arg) {
; CHECK-LABEL: @readlane_readfirstlane_different_block(
; CHECK-NEXT: bb0:
-; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[ARG:%.*]])
+; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[ARG:%.*]])
; CHECK-NEXT: br label [[BB1:%.*]]
; CHECK: bb1:
-; CHECK-NEXT: [[READ1:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[READ0]], i32 0)
-; CHECK-NEXT: ret i32 [[READ1]]
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP0]], i32 0)
+; CHECK-NEXT: ret i32 [[TMP1]]
;
bb0:
%read0 = call i32 @llvm.amdgcn.readfirstlane(i32 %arg)
Index: llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll
+++ llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll
@@ -3,6 +3,11 @@
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,GFX10 %s
declare i32 @llvm.amdgcn.writelane(i32, i32, i32) #0
+declare i8 addrspace(3)* @llvm.amdgcn.writelane.p3i8(i8 addrspace(3)*, i32, i8 addrspace(3)*) #0
+declare i16 @llvm.amdgcn.writelane.i16(i16, i32, i16) #0
+declare half @llvm.amdgcn.writelane.f16(half, i32, half) #0
+declare <3 x i16> @llvm.amdgcn.writelane.v3i16(<3 x i16>, i32, <3 x i16>) #0
+declare <9 x float> @llvm.amdgcn.writelane.v9f32(<9 x float>, i32, <9 x float>) #0
; CHECK-LABEL: {{^}}test_writelane_sreg:
; CIGFX9: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, m0
@@ -79,6 +84,60 @@
ret void
}
+; CHECK-LABEL: {{^}}test_writelane_p3:
+; CHECK: v_writelane_b32 v{{[0-9]+}}, 0, 15
+; CHECK-NOT: v_writelane_b32
+define void @test_writelane_p3(i8 addrspace(3)* addrspace(1)* %out, i8 addrspace(3)* %src) #1 {
+ %writelane = call i8 addrspace(3)* @llvm.amdgcn.writelane.p3i8(i8 addrspace(3)* null, i32 15, i8 addrspace(3)* %src)
+ store i8 addrspace(3)* %writelane, i8 addrspace(3)* addrspace(1)* %out, align 4
+ ret void
+}
+
+; CHECK-LABEL: {{^}}test_writelane_i16:
+; CHECK: v_writelane_b32 v{{[0-9]+}},
+; CHECK-NOT: v_writelane_b32
+define void @test_writelane_i16(i16 addrspace(1)* %out, i16 %src) {
+ %writelane = call i16 @llvm.amdgcn.writelane.i16(i16 1234, i32 15, i16 %src)
+ store i16 %writelane, i16 addrspace(1)* %out, align 2
+ ret void
+}
+
+; CHECK-LABEL: {{^}}test_writelane_f16:
+; CHECK: v_writelane_b32 v{{[0-9]+}},
+; CHECK-NOT: v_writelane_b32
+define void @test_writelane_f16(half addrspace(1)* %out, half %src) {
+ %writelane = call half @llvm.amdgcn.writelane.f16(half 1.0, i32 15, half %src)
+ store half %writelane, half addrspace(1)* %out, align 2
+ ret void
+}
+
+; CHECK-LABEL: {{^}}test_writelane_v3i16:
+; CHECK-DAG: v_writelane_b32 v{{[0-9]+}},
+; CHECK-DAG: v_writelane_b32 v{{[0-9]+}},
+; CHECK-NOT: v_writelane_b32
+define void @test_writelane_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %src) {
+ %writelane = call <3 x i16> @llvm.amdgcn.writelane.v3i16(<3 x i16> zeroinitializer, i32 15, <3 x i16> %src)
+ store <3 x i16> %writelane, <3 x i16> addrspace(1)* %out, align 2
+ ret void
+}
+
+; CHECK-LABEL: {{^}}test_writelane_v9f32:
+; CHECK-DAG: v_writelane_b32 v{{[0-9]+}}, 0, 15
+; CHECK-DAG: v_writelane_b32 v{{[0-9]+}}, 0, 15
+; CHECK-DAG: v_writelane_b32 v{{[0-9]+}}, 0, 15
+; CHECK-DAG: v_writelane_b32 v{{[0-9]+}}, 0, 15
+; CHECK-DAG: v_writelane_b32 v{{[0-9]+}}, 0, 15
+; CHECK-DAG: v_writelane_b32 v{{[0-9]+}}, 0, 15
+; CHECK-DAG: v_writelane_b32 v{{[0-9]+}}, 0, 15
+; CHECK-DAG: v_writelane_b32 v{{[0-9]+}}, 0, 15
+; CHECK-DAG: v_writelane_b32 v{{[0-9]+}}, 0, 15
+; CHECK-NOT: v_writelane_b32
+define void @test_writelane_v9f32(<9 x float> addrspace(1)* %out, <9 x float> %src) {
+ %writelane = call <9 x float> @llvm.amdgcn.writelane.v9f32(<9 x float> zeroinitializer, i32 15, <9 x float> %src)
+ store <9 x float> %writelane, <9 x float> addrspace(1)* %out, align 2
+ ret void
+}
+
declare i32 @llvm.amdgcn.workitem.id.x() #2
attributes #0 = { nounwind readnone convergent }
Index: llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
+++ llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
@@ -1,6 +1,11 @@
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope %s
declare i32 @llvm.amdgcn.readlane(i32, i32) #0
+declare i8 addrspace(3)* @llvm.amdgcn.readlane.p3i8(i8 addrspace(3)*, i32) #0
+declare i16 @llvm.amdgcn.readlane.i16(i16, i32) #0
+declare half @llvm.amdgcn.readlane.f16(half, i32) #0
+declare <3 x i16> @llvm.amdgcn.readlane.v3i16(<3 x i16>, i32) #0
+declare <9 x float> @llvm.amdgcn.readlane.v9f32(<9 x float>, i32) #0
; CHECK-LABEL: {{^}}test_readlane_sreg_sreg:
; CHECK-NOT: v_readlane_b32
@@ -77,6 +82,60 @@
ret void
}
+; CHECK-LABEL: {{^}}test_readlane_p3:
+; CHECK: v_readlane_b32 s{{[0-9]+}}, v2, 15
+; CHECK-NOT: v_readlane_b32
+define void @test_readlane_p3(i8 addrspace(3)* addrspace(1)* %out, i8 addrspace(3)* %src) #1 {
+ %readlane = call i8 addrspace(3)* @llvm.amdgcn.readlane.p3i8(i8 addrspace(3)* %src, i32 15)
+ store i8 addrspace(3)* %readlane, i8 addrspace(3)* addrspace(1)* %out, align 4
+ ret void
+}
+
+; CHECK-LABEL: {{^}}test_readlane_i16:
+; CHECK: v_readlane_b32 s{{[0-9]+}}, v2, 15
+; CHECK-NOT: v_readlane_b32
+define void @test_readlane_i16(i16 addrspace(1)* %out, i16 %src) {
+ %readlane = call i16 @llvm.amdgcn.readlane.i16(i16 %src, i32 15)
+ store i16 %readlane, i16 addrspace(1)* %out, align 2
+ ret void
+}
+
+; CHECK-LABEL: {{^}}test_readlane_f16:
+; CHECK: v_readlane_b32 s{{[0-9]+}}, v2, 15
+; CHECK-NOT: v_readlane_b32
+define void @test_readlane_f16(half addrspace(1)* %out, half %src) {
+ %readlane = call half @llvm.amdgcn.readlane.f16(half %src, i32 15)
+ store half %readlane, half addrspace(1)* %out, align 2
+ ret void
+}
+
+; CHECK-LABEL: {{^}}test_readlane_v3i16:
+; CHECK-DAG: v_readlane_b32 s{{[0-9]+}},
+; CHECK-DAG: v_readlane_b32 s{{[0-9]+}},
+; CHECK-NOT: v_readlane_b32
+define void @test_readlane_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %src) {
+ %readlane = call <3 x i16> @llvm.amdgcn.readlane.v3i16(<3 x i16> %src, i32 15)
+ store <3 x i16> %readlane, <3 x i16> addrspace(1)* %out, align 2
+ ret void
+}
+
+; CHECK-LABEL: {{^}}test_readlane_v9f32:
+; CHECK-DAG: v_readlane_b32 s{{[0-9]+}}, v2, 15
+; CHECK-DAG: v_readlane_b32 s{{[0-9]+}}, v3, 15
+; CHECK-DAG: v_readlane_b32 s{{[0-9]+}}, v4, 15
+; CHECK-DAG: v_readlane_b32 s{{[0-9]+}}, v5, 15
+; CHECK-DAG: v_readlane_b32 s{{[0-9]+}}, v6, 15
+; CHECK-DAG: v_readlane_b32 s{{[0-9]+}}, v7, 15
+; CHECK-DAG: v_readlane_b32 s{{[0-9]+}}, v8, 15
+; CHECK-DAG: v_readlane_b32 s{{[0-9]+}}, v9, 15
+; CHECK-DAG: v_readlane_b32 s{{[0-9]+}}, v10, 15
+; CHECK-NOT: v_readlane_b32
+define void @test_readlane_v9f32(<9 x float> addrspace(1)* %out, <9 x float> %src) {
+ %readlane = call <9 x float> @llvm.amdgcn.readlane.v9f32(<9 x float> %src, i32 15)
+ store <9 x float> %readlane, <9 x float> addrspace(1)* %out, align 2
+ ret void
+}
+
declare i32 @llvm.amdgcn.workitem.id.x() #2
attributes #0 = { nounwind readnone convergent }
Index: llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
+++ llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
@@ -1,11 +1,20 @@
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope %s
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope %s
-declare i32 @llvm.amdgcn.readfirstlane(i32) #0
+declare i32 @llvm.amdgcn.readfirstlane.i32(i32) #0
+declare float @llvm.amdgcn.readfirstlane.f32(float) #0
+declare <2 x half> @llvm.amdgcn.readfirstlane.v2f16(<2 x half>) #0
+declare <2 x i16> @llvm.amdgcn.readfirstlane.v2i16(<2 x i16>) #0
+declare i8 addrspace(3)* @llvm.amdgcn.readfirstlane.p3i8(i8 addrspace(3)*) #0
+declare i16 @llvm.amdgcn.readfirstlane.i16(i16) #0
+declare half @llvm.amdgcn.readfirstlane.f16(half) #0
+declare <3 x i16> @llvm.amdgcn.readfirstlane.v3i16(<3 x i16>) #0
+declare <9 x float> @llvm.amdgcn.readfirstlane.v9f32(<9 x float>) #0
-; CHECK-LABEL: {{^}}test_readfirstlane:
+; CHECK-LABEL: {{^}}test_readfirstlane_i32:
; CHECK: v_readfirstlane_b32 s{{[0-9]+}}, v2
-define void @test_readfirstlane(i32 addrspace(1)* %out, i32 %src) #1 {
- %readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 %src)
+define void @test_readfirstlane_i32(i32 addrspace(1)* %out, i32 %src) #1 {
+ %readfirstlane = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %src)
store i32 %readfirstlane, i32 addrspace(1)* %out, align 4
ret void
}
@@ -15,7 +24,7 @@
; CHECK-NOT: [[SGPR_VAL]]
; CHECK: ; use [[SGPR_VAL]]
define amdgpu_kernel void @test_readfirstlane_imm(i32 addrspace(1)* %out) #1 {
- %readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 32)
+ %readfirstlane = call i32 @llvm.amdgcn.readfirstlane.i32(i32 32)
call void asm sideeffect "; use $0", "s"(i32 %readfirstlane)
ret void
}
@@ -25,7 +34,7 @@
; CHECK-NOT: [[VVAL]]
; CHECK: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[VVAL]]
define amdgpu_kernel void @test_readfirstlane_imm_fold(i32 addrspace(1)* %out) #1 {
- %readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 32)
+ %readfirstlane = call i32 @llvm.amdgcn.readfirstlane.i32(i32 32)
store i32 %readfirstlane, i32 addrspace(1)* %out, align 4
ret void
}
@@ -36,7 +45,7 @@
; CHECK: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[VVAL]]
define amdgpu_kernel void @test_readfirstlane_m0(i32 addrspace(1)* %out) #1 {
%m0 = call i32 asm "s_mov_b32 m0, -1", "={m0}"()
- %readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 %m0)
+ %readfirstlane = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %m0)
store i32 %readfirstlane, i32 addrspace(1)* %out, align 4
ret void
}
@@ -51,7 +60,7 @@
; CHECK: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[VCOPY]]
define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr(i32 addrspace(1)* %out) #1 {
%sgpr = call i32 asm "s_mov_b32 $0, 0", "=s"()
- %readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 %sgpr)
+ %readfirstlane = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %sgpr)
store i32 %readfirstlane, i32 addrspace(1)* %out, align 4
ret void
}
@@ -62,10 +71,91 @@
define amdgpu_kernel void @test_readfirstlane_fi(i32 addrspace(1)* %out) #1 {
%alloca = alloca i32, addrspace(5)
%int = ptrtoint i32 addrspace(5)* %alloca to i32
- %readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 %int)
+ %readfirstlane = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %int)
call void asm sideeffect "; use $0", "s"(i32 %readfirstlane)
ret void
}
+; CHECK-LABEL: {{^}}test_readfirstlane_f32:
+; CHECK: v_readfirstlane_b32 s{{[0-9]+}}, v2
+; CHECK-NOT: v_readfirstlane_b32
+define void @test_readfirstlane_f32(float addrspace(1)* %out, float %src) #1 {
+ %readfirstlane = call float @llvm.amdgcn.readfirstlane.f32(float %src)
+ store float %readfirstlane, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; CHECK-LABEL: {{^}}test_readfirstlane_v2f16:
+; CHECK: v_readfirstlane_b32 s{{[0-9]+}}, v2
+; CHECK-NOT: v_readfirstlane_b32
+define void @test_readfirstlane_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %src) #1 {
+ %readfirstlane = call <2 x half> @llvm.amdgcn.readfirstlane.v2f16(<2 x half> %src)
+ store <2 x half> %readfirstlane, <2 x half> addrspace(1)* %out, align 4
+ ret void
+}
+
+; CHECK-LABEL: {{^}}test_readfirstlane_v2i16:
+; CHECK: v_readfirstlane_b32 s{{[0-9]+}}, v2
+; CHECK-NOT: v_readfirstlane_b32
+define void @test_readfirstlane_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %src) #1 {
+ %readfirstlane = call <2 x i16> @llvm.amdgcn.readfirstlane.v2i16(<2 x i16> %src)
+ store <2 x i16> %readfirstlane, <2 x i16> addrspace(1)* %out, align 4
+ ret void
+}
+
+; CHECK-LABEL: {{^}}test_readfirstlane_p3:
+; CHECK: v_readfirstlane_b32 s{{[0-9]+}}, v2
+; CHECK-NOT: v_readfirstlane_b32
+define void @test_readfirstlane_p3(i8 addrspace(3)* addrspace(1)* %out, i8 addrspace(3)* %src) #1 {
+ %readfirstlane = call i8 addrspace(3)* @llvm.amdgcn.readfirstlane.p3i8(i8 addrspace(3)* %src)
+ store i8 addrspace(3)* %readfirstlane, i8 addrspace(3)* addrspace(1)* %out, align 4
+ ret void
+}
+
+; CHECK-LABEL: {{^}}test_readfirstlane_i16:
+; CHECK: v_readfirstlane_b32 s{{[0-9]+}}, v2
+; CHECK-NOT: v_readfirstlane_b32
+define void @test_readfirstlane_i16(i16 addrspace(1)* %out, i16 %src) {
+ %readfirstlane = call i16 @llvm.amdgcn.readfirstlane.i16(i16 %src)
+ store i16 %readfirstlane, i16 addrspace(1)* %out, align 2
+ ret void
+}
+
+; CHECK-LABEL: {{^}}test_readfirstlane_f16:
+; CHECK: v_readfirstlane_b32 s{{[0-9]+}}, v2
+; CHECK-NOT: v_readfirstlane_b32
+define void @test_readfirstlane_f16(half addrspace(1)* %out, half %src) {
+ %readfirstlane = call half @llvm.amdgcn.readfirstlane.f16(half %src)
+ store half %readfirstlane, half addrspace(1)* %out, align 2
+ ret void
+}
+
+; CHECK-LABEL: {{^}}test_readfirstlane_v3i16:
+; CHECK-DAG: v_readfirstlane_b32 s{{[0-9]+}},
+; CHECK-DAG: v_readfirstlane_b32 s{{[0-9]+}},
+; CHECK-NOT: v_readfirstlane_b32
+define void @test_readfirstlane_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %src) {
+ %readfirstlane = call <3 x i16> @llvm.amdgcn.readfirstlane.v3i16(<3 x i16> %src)
+ store <3 x i16> %readfirstlane, <3 x i16> addrspace(1)* %out, align 2
+ ret void
+}
+
+; CHECK-LABEL: {{^}}test_readfirstlane_v9f32:
+; CHECK-DAG: v_readfirstlane_b32 s{{[0-9]+}}, v2
+; CHECK-DAG: v_readfirstlane_b32 s{{[0-9]+}}, v3
+; CHECK-DAG: v_readfirstlane_b32 s{{[0-9]+}}, v4
+; CHECK-DAG: v_readfirstlane_b32 s{{[0-9]+}}, v5
+; CHECK-DAG: v_readfirstlane_b32 s{{[0-9]+}}, v6
+; CHECK-DAG: v_readfirstlane_b32 s{{[0-9]+}}, v7
+; CHECK-DAG: v_readfirstlane_b32 s{{[0-9]+}}, v8
+; CHECK-DAG: v_readfirstlane_b32 s{{[0-9]+}}, v9
+; CHECK-DAG: v_readfirstlane_b32 s{{[0-9]+}}, v10
+; CHECK-NOT: v_readfirstlane_b32
+define void @test_readfirstlane_v9f32(<9 x float> addrspace(1)* %out, <9 x float> %src) {
+ %readfirstlane = call <9 x float> @llvm.amdgcn.readfirstlane.v9f32(<9 x float> %src)
+ store <9 x float> %readfirstlane, <9 x float> addrspace(1)* %out, align 2
+ ret void
+}
+
attributes #0 = { nounwind readnone convergent }
attributes #1 = { nounwind }
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.readfirstlane.mir
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.readfirstlane.mir
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.readfirstlane.mir
@@ -1,11 +1,11 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -march=amdgcn -mcpu=tahiti -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=2 -pass-remarks-missed='gisel*' %s -o - 2> %t | FileCheck -check-prefix=GCN %s
+# RUN: llc -march=amdgcn -mcpu=tahiti -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=2 -pass-remarks-missed='gisel.*' %s -o - 2> %t | FileCheck -check-prefix=GCN %s
# RUN: FileCheck -check-prefix=ERR %s < %t
-# ERR: remark: <unknown>:0:0: cannot select: %1:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), %0:sgpr(s32) (in function: readfirstlane_s)
+# ERR: remark: <unknown>:0:0: cannot select: %1:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), %0:sgpr(s32) (in function: readfirstlane_s32_s)
---
-name: readfirstlane_v
+name: readfirstlane_s32_v
legalized: true
regBankSelected: true
tracksRegLiveness: true
@@ -13,7 +13,7 @@
body: |
bb.0:
liveins: $vgpr0
- ; GCN-LABEL: name: readfirstlane_v
+ ; GCN-LABEL: name: readfirstlane_s32_v
; GCN: liveins: $vgpr0
; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GCN: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec
@@ -24,7 +24,7 @@
...
---
-name: readfirstlane_v_imm
+name: readfirstlane_v_s32_imm
legalized: true
regBankSelected: true
tracksRegLiveness: true
@@ -32,7 +32,7 @@
body: |
bb.0:
- ; GCN-LABEL: name: readfirstlane_v_imm
+ ; GCN-LABEL: name: readfirstlane_v_s32_imm
; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 123, implicit $exec
; GCN: [[COPY:%[0-9]+]]:sreg_32 = COPY [[V_MOV_B32_e32_]]
; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 [[COPY]]
@@ -44,7 +44,7 @@
# Make sure this fails to select
---
-name: readfirstlane_s
+name: readfirstlane_s32_s
legalized: true
regBankSelected: true
tracksRegLiveness: true
@@ -52,7 +52,7 @@
body: |
bb.0:
liveins: $sgpr0
- ; GCN-LABEL: name: readfirstlane_s
+ ; GCN-LABEL: name: readfirstlane_s32_s
; GCN: liveins: $sgpr0
; GCN: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
; GCN: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY]](s32)
Index: llvm/lib/Target/AMDGPU/SIInstructions.td
===================================================================
--- llvm/lib/Target/AMDGPU/SIInstructions.td
+++ llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -2203,7 +2203,7 @@
// FIXME: Should also do this for readlane, but tablegen crashes on
// the ignored src1.
def : GCNPat<
- (int_amdgcn_readfirstlane (i32 imm:$src)),
+ (i32 (int_amdgcn_readfirstlane (i32 imm:$src))),
(S_MOV_B32 SReg_32:$src)
>;
Index: llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -715,6 +715,8 @@
addPass(createAtomicExpandPass());
+ if (EnableAtomicOptimizations)
+ addPass(createAMDGPUAtomicOptimizerPass());
addPass(createAMDGPULowerIntrinsicsPass());
@@ -871,10 +873,6 @@
bool GCNPassConfig::addPreISel() {
AMDGPUPassConfig::addPreISel();
- if (EnableAtomicOptimizations) {
- addPass(createAMDGPUAtomicOptimizerPass());
- }
-
// FIXME: We need to run a pass to propagate the attributes when calls are
// supported.
Index: llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -655,6 +655,17 @@
if (match(Src, PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readlane>())) {
return IC.replaceInstUsesWith(II, Src);
}
+
+ // readfirstlane (bitcast x) -> bitcast (readfirstlane x)
+ Value *BitcastInput = nullptr;
+ if (match(Src,
+ PatternMatch::m_BitCast(PatternMatch::m_Value(BitcastInput)))) {
+ CallInst *NewCall =
+ IC.Builder.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane,
+ {BitcastInput->getType()}, BitcastInput);
+ Value *NewCast = IC.Builder.CreateBitCast(NewCall, II.getType());
+ return IC.replaceInstUsesWith(II, NewCast);
+ }
} else {
// readlane (readlane x, y), y -> readlane x, y
if (match(Src, PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readlane>(
Index: llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -224,6 +224,10 @@
bool visitIntrinsicInst(IntrinsicInst &I);
bool visitBitreverseIntrinsicInst(IntrinsicInst &I);
+ bool visitLaneIntrinsicInst(IntrinsicInst &I);
+ Value *buildLegalLaneIntrinsic(IRBuilder<> &B, Intrinsic::ID IID,
+ Value *Data0, Value *Lane = nullptr,
+ Value *Data1 = nullptr);
bool doInitialization(Module &M) override;
bool runOnFunction(Function &F) override;
@@ -1344,6 +1348,10 @@
switch (I.getIntrinsicID()) {
case Intrinsic::bitreverse:
return visitBitreverseIntrinsicInst(I);
+ case Intrinsic::amdgcn_readfirstlane:
+ case Intrinsic::amdgcn_readlane:
+ case Intrinsic::amdgcn_writelane:
+ return visitLaneIntrinsicInst(I);
default:
return false;
}
@@ -1359,6 +1367,138 @@
return Changed;
}
+Value *AMDGPUCodeGenPrepare::buildLegalLaneIntrinsic(IRBuilder<> &B,
+ Intrinsic::ID IID,
+ Value *Data0, Value *Lane,
+ Value *Data1) {
+ Type *Ty = Data0->getType();
+
+ if (Ty == B.getInt32Ty()) {
+ Value *Args[3] = {Data0, Lane, Data1};
+ unsigned NumArgs = Data1 != nullptr ? 3 : Lane != nullptr ? 2 : 1;
+ return B.CreateIntrinsic(IID, {B.getInt32Ty()}, {Args, NumArgs});
+ }
+
+ if (auto *VecTy = dyn_cast<FixedVectorType>(Ty)) {
+ Type *EltType = VecTy->getElementType();
+ bool is16Bit =
+ (EltType->isIntegerTy() && EltType->getIntegerBitWidth() == 16) ||
+ (EltType->isHalfTy());
+ int EC = VecTy->getElementCount().Min;
+
+ Value *Result = UndefValue::get(Ty);
+ for (int i = 0; i < EC; i += 1 + is16Bit) {
+ Value *EltData0;
+ Value *EltData1 = nullptr;
+
+ if (is16Bit) {
+ int Idxs[2] = {i, i + 1};
+ EltData0 = B.CreateShuffleVector(Data0, UndefValue::get(Ty), Idxs);
+ EltData0 = B.CreateBitCast(EltData0, B.getInt32Ty());
+ } else {
+ EltData0 = B.CreateExtractElement(Data0, i);
+ }
+
+ if (Data1) {
+ if (is16Bit) {
+ int Idxs[2] = {i, i + 1};
+ EltData1 = B.CreateShuffleVector(Data1, UndefValue::get(Ty), Idxs);
+ EltData1 = B.CreateBitCast(EltData1, B.getInt32Ty());
+ } else {
+ EltData1 = B.CreateExtractElement(Data1, i);
+ }
+ }
+
+ Value *EltResult =
+ buildLegalLaneIntrinsic(B, IID, EltData0, Lane, EltData1);
+
+ if (is16Bit) {
+ EltResult =
+ B.CreateBitCast(EltResult, FixedVectorType::get(EltType, 2));
+ for (int j = 0; j < 2; ++j) {
+ if (i + j >= EC)
+ break;
+ Result = B.CreateInsertElement(
+ Result, B.CreateExtractElement(EltResult, j), i + j);
+ }
+ } else {
+ Result = B.CreateInsertElement(Result, EltResult, i);
+ }
+ }
+
+ return Result;
+ }
+
+ unsigned BitWidth = DL->getTypeSizeInBits(Ty);
+ Type *IntTy = Ty;
+
+ if (!Ty->isIntegerTy()) {
+ IntTy = IntegerType::get(Mod->getContext(), BitWidth);
+ Data0 = B.CreateBitOrPointerCast(Data0, IntTy);
+ if (Data1)
+ Data1 = B.CreateBitOrPointerCast(Data1, IntTy);
+ }
+
+ if ((BitWidth % 32) != 0) {
+ Type *ExtendedTy =
+ IntegerType::get(Mod->getContext(), (BitWidth + 31) & ~31);
+ Data0 = B.CreateZExt(Data0, ExtendedTy);
+ if (Data1)
+ Data1 = B.CreateZExt(Data1, ExtendedTy);
+ }
+
+ if (BitWidth > 32) {
+ Type *VecTy = FixedVectorType::get(B.getInt32Ty(), (BitWidth + 31) / 32);
+ Data0 = B.CreateBitCast(Data0, VecTy);
+ if (Data1)
+ Data1 = B.CreateBitCast(Data1, VecTy);
+ }
+
+ Value *Result = buildLegalLaneIntrinsic(B, IID, Data0, Lane, Data1);
+
+ if ((BitWidth % 32) != 0) {
+ if (BitWidth > 32) {
+ Result = B.CreateBitCast(
+ Result, IntegerType::get(Mod->getContext(), (BitWidth + 31) / 32));
+ }
+
+ Result =
+ B.CreateTrunc(Result, IntegerType::get(Mod->getContext(), BitWidth));
+ }
+
+ return B.CreateBitOrPointerCast(Result, Ty);
+}
+
+/// "Legalize" readfirstlane/readlane/writelane to single-dword intrinsics
+/// on i32.
+///
+/// Done during codegen prepare purely because this turned out to be simpler
+/// than doing it in this generality in SelectionDAG.
+bool AMDGPUCodeGenPrepare::visitLaneIntrinsicInst(IntrinsicInst &I) {
+ Type *Ty = I.getType();
+ if (Ty->isIntegerTy(32) && Ty->getIntegerBitWidth() == 32)
+ return false; // already legal
+
+ Value *Data0 = I.getArgOperand(0);
+ Value *Lane = nullptr;
+ Value *Data1 = nullptr;
+
+ if (I.getIntrinsicID() == Intrinsic::amdgcn_readlane) {
+ Lane = I.getArgOperand(1);
+ } else if (I.getIntrinsicID() == Intrinsic::amdgcn_writelane) {
+ Lane = I.getArgOperand(1);
+ Data1 = I.getArgOperand(2);
+ }
+
+ IRBuilder<> Builder(&I);
+ Value *Legalized =
+ buildLegalLaneIntrinsic(Builder, I.getIntrinsicID(), Data0, Lane, Data1);
+
+ I.replaceAllUsesWith(Legalized);
+ I.eraseFromParent();
+ return true;
+}
+
bool AMDGPUCodeGenPrepare::doInitialization(Module &M) {
Mod = &M;
DL = &Mod->getDataLayout();
Index: llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
@@ -285,11 +285,11 @@
Type *const Ty = V->getType();
Module *M = B.GetInsertBlock()->getModule();
Function *UpdateDPP =
- Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, Ty);
+ Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, {Ty});
Function *PermLaneX16 =
Intrinsic::getDeclaration(M, Intrinsic::amdgcn_permlanex16, {});
Function *ReadLane =
- Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {});
+ Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {Ty});
for (unsigned Idx = 0; Idx < 4; Idx++) {
V = buildNonAtomicBinOp(
@@ -344,11 +344,11 @@
Type *const Ty = V->getType();
Module *M = B.GetInsertBlock()->getModule();
Function *UpdateDPP =
- Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, Ty);
+ Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, {Ty});
Function *ReadLane =
- Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {});
+ Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {Ty});
Function *WriteLane =
- Intrinsic::getDeclaration(M, Intrinsic::amdgcn_writelane, {});
+ Intrinsic::getDeclaration(M, Intrinsic::amdgcn_writelane, {Ty});
if (ST->hasDPPWavefrontShifts()) {
// GFX9 has DPP wavefront shift operations.
@@ -490,25 +490,8 @@
// each active lane in the wavefront. This will be our new value which we
// will provide to the atomic operation.
Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1);
- if (TyBitWidth == 64) {
- Value *const ExtractLo = B.CreateTrunc(NewV, B.getInt32Ty());
- Value *const ExtractHi =
- B.CreateTrunc(B.CreateLShr(NewV, 32), B.getInt32Ty());
- CallInst *const ReadLaneLo = B.CreateIntrinsic(
- Intrinsic::amdgcn_readlane, {}, {ExtractLo, LastLaneIdx});
- CallInst *const ReadLaneHi = B.CreateIntrinsic(
- Intrinsic::amdgcn_readlane, {}, {ExtractHi, LastLaneIdx});
- Value *const PartialInsert = B.CreateInsertElement(
- UndefValue::get(VecTy), ReadLaneLo, B.getInt32(0));
- Value *const Insert =
- B.CreateInsertElement(PartialInsert, ReadLaneHi, B.getInt32(1));
- NewV = B.CreateBitCast(Insert, Ty);
- } else if (TyBitWidth == 32) {
- NewV = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {},
- {NewV, LastLaneIdx});
- } else {
- llvm_unreachable("Unhandled atomic bit width");
- }
+ NewV = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {Ty},
+ {NewV, LastLaneIdx});
// Finally mark the readlanes in the WWM section.
NewV = B.CreateIntrinsic(Intrinsic::amdgcn_wwm, Ty, NewV);
@@ -587,27 +570,8 @@
// We need to broadcast the value who was the lowest active lane (the first
// lane) to all other lanes in the wavefront. We use an intrinsic for this,
// but have to handle 64-bit broadcasts with two calls to this intrinsic.
- Value *BroadcastI = nullptr;
-
- if (TyBitWidth == 64) {
- Value *const ExtractLo = B.CreateTrunc(PHI, B.getInt32Ty());
- Value *const ExtractHi =
- B.CreateTrunc(B.CreateLShr(PHI, 32), B.getInt32Ty());
- CallInst *const ReadFirstLaneLo =
- B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractLo);
- CallInst *const ReadFirstLaneHi =
- B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractHi);
- Value *const PartialInsert = B.CreateInsertElement(
- UndefValue::get(VecTy), ReadFirstLaneLo, B.getInt32(0));
- Value *const Insert =
- B.CreateInsertElement(PartialInsert, ReadFirstLaneHi, B.getInt32(1));
- BroadcastI = B.CreateBitCast(Insert, Ty);
- } else if (TyBitWidth == 32) {
-
- BroadcastI = B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, PHI);
- } else {
- llvm_unreachable("Unhandled atomic bit width");
- }
+ Value *BroadcastI =
+ B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {Ty}, {PHI});
// Now that we have the result of our single atomic operation, we need to
// get our individual lane's slice into the result. We use the lane offset
Index: llvm/include/llvm/IR/IntrinsicsAMDGPU.td
===================================================================
--- llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -1416,26 +1416,25 @@
[IntrNoMem, IntrConvergent, IntrWillReturn]>;
def int_amdgcn_readfirstlane :
- GCCBuiltin<"__builtin_amdgcn_readfirstlane">,
- Intrinsic<[llvm_i32_ty], [llvm_i32_ty],
+ Intrinsic<[llvm_any_ty], [LLVMMatchType<0>],
[IntrNoMem, IntrConvergent, IntrWillReturn]>;
// The lane argument must be uniform across the currently active threads of the
// current wave. Otherwise, the result is undefined.
def int_amdgcn_readlane :
- GCCBuiltin<"__builtin_amdgcn_readlane">,
- Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
+ Intrinsic<[llvm_any_ty],
+ [LLVMMatchType<0>, // data input
+ llvm_i32_ty], // uniform lane select
[IntrNoMem, IntrConvergent, IntrWillReturn]>;
// The value to write and lane select arguments must be uniform across the
// currently active threads of the current wave. Otherwise, the result is
// undefined.
def int_amdgcn_writelane :
- GCCBuiltin<"__builtin_amdgcn_writelane">,
- Intrinsic<[llvm_i32_ty], [
- llvm_i32_ty, // uniform value to write: returned by the selected lane
- llvm_i32_ty, // uniform lane select
- llvm_i32_ty // returned by all lanes other than the selected one
+ Intrinsic<[llvm_any_ty], [
+ LLVMMatchType<0>, // uniform value to write: returned by the selected lane
+ llvm_i32_ty, // uniform lane select
+ LLVMMatchType<0> // returned by all lanes other than the selected one
],
[IntrNoMem, IntrConvergent, IntrWillReturn]
>;
Index: clang/test/CodeGenOpenCL/builtins-amdgcn.cl
===================================================================
--- clang/test/CodeGenOpenCL/builtins-amdgcn.cl
+++ clang/test/CodeGenOpenCL/builtins-amdgcn.cl
@@ -291,15 +291,15 @@
*out = __builtin_amdgcn_ds_bpermute(a, b);
}
-// CHECK-LABEL: @test_readfirstlane
-// CHECK: call i32 @llvm.amdgcn.readfirstlane(i32 %a)
+// CHECK-LABEL: @test_readfirstlane(
+// CHECK: call i32 @llvm.amdgcn.readfirstlane.i32(i32 %a)
void test_readfirstlane(global int* out, int a)
{
*out = __builtin_amdgcn_readfirstlane(a);
}
// CHECK-LABEL: @test_readlane
-// CHECK: call i32 @llvm.amdgcn.readlane(i32 %a, i32 %b)
+// CHECK: call i32 @llvm.amdgcn.readlane.i32(i32 %a, i32 %b)
void test_readlane(global int* out, int a, int b)
{
*out = __builtin_amdgcn_readlane(a, b);
Index: clang/lib/CodeGen/CGBuiltin.cpp
===================================================================
--- clang/lib/CodeGen/CGBuiltin.cpp
+++ clang/lib/CodeGen/CGBuiltin.cpp
@@ -14873,6 +14873,10 @@
}
LLVM_FALLTHROUGH;
}
+ case AMDGPU::BI__builtin_amdgcn_readfirstlane:
+ return emitUnaryBuiltin(*this, E, Intrinsic::amdgcn_readfirstlane);
+ case AMDGPU::BI__builtin_amdgcn_readlane:
+ return emitBinaryBuiltin(*this, E, Intrinsic::amdgcn_readlane);
default:
return nullptr;
}
_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits