nhaehnle updated this revision to Diff 286898.
nhaehnle added a comment.

Don't duplicate the intrinsics. Rely on D86317
<https://reviews.llvm.org/D86317> to reduce the pain this change
causes for downstream users.
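
For reference, the lane intrinsics become overloaded on the data type
(llvm_any_ty in IntrinsicsAMDGPU.td). As a sketch based on the updated
definitions and the new tests, declarations like the following are
accepted after this change, with suffixes following the usual intrinsic
name mangling:

  declare i32 @llvm.amdgcn.readfirstlane.i32(i32)
  declare half @llvm.amdgcn.readlane.f16(half, i32)
  declare <2 x i16> @llvm.amdgcn.writelane.v2i16(<2 x i16>, i32, <2 x i16>)

AMDGPUCodeGenPrepare legalizes the non-i32 variants to 32-bit operations
before instruction selection.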


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D86154/new/

https://reviews.llvm.org/D86154

Files:
  clang/lib/CodeGen/CGBuiltin.cpp
  clang/test/CodeGenOpenCL/builtins-amdgcn.cl
  llvm/include/llvm/IR/IntrinsicsAMDGPU.td
  llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
  llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
  llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
  llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
  llvm/lib/Target/AMDGPU/SIInstructions.td
  llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.readfirstlane.mir
  llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
  llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
  llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll
  llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll

Index: llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll
===================================================================
--- llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll
+++ llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll
@@ -2507,8 +2507,8 @@
 
 define amdgpu_kernel void @readfirstlane_constant(i32 %arg) {
 ; CHECK-LABEL: @readfirstlane_constant(
-; CHECK-NEXT:    [[VAR:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[ARG:%.*]])
-; CHECK-NEXT:    store volatile i32 [[VAR]], i32* undef, align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[ARG:%.*]])
+; CHECK-NEXT:    store volatile i32 [[TMP1]], i32* undef, align 4
 ; CHECK-NEXT:    store volatile i32 0, i32* undef, align 4
 ; CHECK-NEXT:    store volatile i32 123, i32* undef, align 4
 ; CHECK-NEXT:    store volatile i32 ptrtoint (i32* @gv to i32), i32* undef, align 4
@@ -2530,8 +2530,8 @@
 
 define i32 @readfirstlane_idempotent(i32 %arg) {
 ; CHECK-LABEL: @readfirstlane_idempotent(
-; CHECK-NEXT:    [[READ0:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[ARG:%.*]])
-; CHECK-NEXT:    ret i32 [[READ0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[ARG:%.*]])
+; CHECK-NEXT:    ret i32 [[TMP1]]
 ;
   %read0 = call i32 @llvm.amdgcn.readfirstlane(i32 %arg)
   %read1 = call i32 @llvm.amdgcn.readfirstlane(i32 %read0)
@@ -2541,8 +2541,8 @@
 
 define i32 @readfirstlane_readlane(i32 %arg) {
 ; CHECK-LABEL: @readfirstlane_readlane(
-; CHECK-NEXT:    [[READ0:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[ARG:%.*]])
-; CHECK-NEXT:    ret i32 [[READ0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[ARG:%.*]])
+; CHECK-NEXT:    ret i32 [[TMP1]]
 ;
   %read0 = call i32 @llvm.amdgcn.readfirstlane(i32 %arg)
   %read1 = call i32 @llvm.amdgcn.readlane(i32 %read0, i32 0)
@@ -2552,11 +2552,11 @@
 define i32 @readfirstlane_readfirstlane_different_block(i32 %arg) {
 ; CHECK-LABEL: @readfirstlane_readfirstlane_different_block(
 ; CHECK-NEXT:  bb0:
-; CHECK-NEXT:    [[READ0:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[ARG:%.*]])
+; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[ARG:%.*]])
 ; CHECK-NEXT:    br label [[BB1:%.*]]
 ; CHECK:       bb1:
-; CHECK-NEXT:    [[READ1:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[READ0]])
-; CHECK-NEXT:    ret i32 [[READ1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP0]])
+; CHECK-NEXT:    ret i32 [[TMP1]]
 ;
 bb0:
   %read0 = call i32 @llvm.amdgcn.readfirstlane(i32 %arg)
@@ -2570,11 +2570,11 @@
 define i32 @readfirstlane_readlane_different_block(i32 %arg) {
 ; CHECK-LABEL: @readfirstlane_readlane_different_block(
 ; CHECK-NEXT:  bb0:
-; CHECK-NEXT:    [[READ0:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[ARG:%.*]], i32 0)
+; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG:%.*]], i32 0)
 ; CHECK-NEXT:    br label [[BB1:%.*]]
 ; CHECK:       bb1:
-; CHECK-NEXT:    [[READ1:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[READ0]])
-; CHECK-NEXT:    ret i32 [[READ1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP0]])
+; CHECK-NEXT:    ret i32 [[TMP1]]
 ;
 bb0:
   %read0 = call i32 @llvm.amdgcn.readlane(i32 %arg, i32 0)
@@ -2585,6 +2585,41 @@
   ret i32 %read1
 }
 
+define i32 @readfirstlane_bitcast(float %arg) {
+; CHECK-LABEL: @readfirstlane_bitcast(
+; CHECK-NEXT:    [[TMP1:%.*]] = call float @llvm.amdgcn.readfirstlane.f32(float [[ARG:%.*]])
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast float [[TMP1]] to i32
+; CHECK-NEXT:    ret i32 [[TMP2]]
+;
+  %bitcast.arg = bitcast float %arg to i32
+  %read = call i32 @llvm.amdgcn.readfirstlane(i32 %bitcast.arg)
+  ret i32 %read
+}
+
+define float @bitcast_readfirstlane_bitcast(float %arg) {
+; CHECK-LABEL: @bitcast_readfirstlane_bitcast(
+; CHECK-NEXT:    [[TMP1:%.*]] = call float @llvm.amdgcn.readfirstlane.f32(float [[ARG:%.*]])
+; CHECK-NEXT:    ret float [[TMP1]]
+;
+  %bitcast.arg = bitcast float %arg to i32
+  %read = call i32 @llvm.amdgcn.readfirstlane(i32 %bitcast.arg)
+  %cast.read = bitcast i32 %read to float
+  ret float %cast.read
+}
+
+define i32 @readfirstlane_bitcast_multi_use(float %arg) {
+; CHECK-LABEL: @readfirstlane_bitcast_multi_use(
+; CHECK-NEXT:    store float [[ARG:%.*]], float* undef, align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = call float @llvm.amdgcn.readfirstlane.f32(float [[ARG]])
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast float [[TMP1]] to i32
+; CHECK-NEXT:    ret i32 [[TMP2]]
+;
+  %bitcast.arg = bitcast float %arg to i32
+  store i32 %bitcast.arg, i32* undef
+  %read = call i32 @llvm.amdgcn.readfirstlane(i32 %bitcast.arg)
+  ret i32 %read
+}
+
 ; --------------------------------------------------------------------
 ; llvm.amdgcn.readlane
 ; --------------------------------------------------------------------
@@ -2593,8 +2628,8 @@
 
 define amdgpu_kernel void @readlane_constant(i32 %arg, i32 %lane) {
 ; CHECK-LABEL: @readlane_constant(
-; CHECK-NEXT:    [[VAR:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[ARG:%.*]], i32 7)
-; CHECK-NEXT:    store volatile i32 [[VAR]], i32* undef, align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG:%.*]], i32 7)
+; CHECK-NEXT:    store volatile i32 [[TMP1]], i32* undef, align 4
 ; CHECK-NEXT:    store volatile i32 0, i32* undef, align 4
 ; CHECK-NEXT:    store volatile i32 123, i32* undef, align 4
 ; CHECK-NEXT:    store volatile i32 ptrtoint (i32* @gv to i32), i32* undef, align 4
@@ -2616,8 +2651,8 @@
 
 define i32 @readlane_idempotent(i32 %arg, i32 %lane) {
 ; CHECK-LABEL: @readlane_idempotent(
-; CHECK-NEXT:    [[READ0:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[ARG:%.*]], i32 [[LANE:%.*]])
-; CHECK-NEXT:    ret i32 [[READ0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG:%.*]], i32 [[LANE:%.*]])
+; CHECK-NEXT:    ret i32 [[TMP1]]
 ;
   %read0 = call i32 @llvm.amdgcn.readlane(i32 %arg, i32 %lane)
   %read1 = call i32 @llvm.amdgcn.readlane(i32 %read0, i32 %lane)
@@ -2626,9 +2661,9 @@
 
 define i32 @readlane_idempotent_different_lanes(i32 %arg, i32 %lane0, i32 %lane1) {
 ; CHECK-LABEL: @readlane_idempotent_different_lanes(
-; CHECK-NEXT:    [[READ0:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[ARG:%.*]], i32 [[LANE0:%.*]])
-; CHECK-NEXT:    [[READ1:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[READ0]], i32 [[LANE1:%.*]])
-; CHECK-NEXT:    ret i32 [[READ1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG:%.*]], i32 [[LANE0:%.*]])
+; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP1]], i32 [[LANE1:%.*]])
+; CHECK-NEXT:    ret i32 [[TMP2]]
 ;
   %read0 = call i32 @llvm.amdgcn.readlane(i32 %arg, i32 %lane0)
   %read1 = call i32 @llvm.amdgcn.readlane(i32 %read0, i32 %lane1)
@@ -2637,8 +2672,8 @@
 
 define i32 @readlane_readfirstlane(i32 %arg) {
 ; CHECK-LABEL: @readlane_readfirstlane(
-; CHECK-NEXT:    [[READ0:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[ARG:%.*]])
-; CHECK-NEXT:    ret i32 [[READ0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[ARG:%.*]])
+; CHECK-NEXT:    ret i32 [[TMP1]]
 ;
   %read0 = call i32 @llvm.amdgcn.readfirstlane(i32 %arg)
   %read1 = call i32 @llvm.amdgcn.readlane(i32 %read0, i32 0)
@@ -2648,11 +2683,11 @@
 define i32 @readlane_idempotent_different_block(i32 %arg, i32 %lane) {
 ; CHECK-LABEL: @readlane_idempotent_different_block(
 ; CHECK-NEXT:  bb0:
-; CHECK-NEXT:    [[READ0:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[ARG:%.*]], i32 [[LANE:%.*]])
+; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG:%.*]], i32 [[LANE:%.*]])
 ; CHECK-NEXT:    br label [[BB1:%.*]]
 ; CHECK:       bb1:
-; CHECK-NEXT:    [[READ1:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[READ0]], i32 [[LANE]])
-; CHECK-NEXT:    ret i32 [[READ1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP0]], i32 [[LANE]])
+; CHECK-NEXT:    ret i32 [[TMP1]]
 ;
 bb0:
   %read0 = call i32 @llvm.amdgcn.readlane(i32 %arg, i32 %lane)
@@ -2667,11 +2702,11 @@
 define i32 @readlane_readfirstlane_different_block(i32 %arg) {
 ; CHECK-LABEL: @readlane_readfirstlane_different_block(
 ; CHECK-NEXT:  bb0:
-; CHECK-NEXT:    [[READ0:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[ARG:%.*]])
+; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[ARG:%.*]])
 ; CHECK-NEXT:    br label [[BB1:%.*]]
 ; CHECK:       bb1:
-; CHECK-NEXT:    [[READ1:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[READ0]], i32 0)
-; CHECK-NEXT:    ret i32 [[READ1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP0]], i32 0)
+; CHECK-NEXT:    ret i32 [[TMP1]]
 ;
 bb0:
   %read0 = call i32 @llvm.amdgcn.readfirstlane(i32 %arg)
Index: llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll
+++ llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll
@@ -3,6 +3,11 @@
 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,GFX10 %s
 
 declare i32 @llvm.amdgcn.writelane(i32, i32, i32) #0
+declare i8 addrspace(3)* @llvm.amdgcn.writelane.p3i8(i8 addrspace(3)*, i32, i8 addrspace(3)*) #0
+declare i16 @llvm.amdgcn.writelane.i16(i16, i32, i16) #0
+declare half @llvm.amdgcn.writelane.f16(half, i32, half) #0
+declare <3 x i16> @llvm.amdgcn.writelane.v3i16(<3 x i16>, i32, <3 x i16>) #0
+declare <9 x float> @llvm.amdgcn.writelane.v9f32(<9 x float>, i32, <9 x float>) #0
 
 ; CHECK-LABEL: {{^}}test_writelane_sreg:
 ; CIGFX9: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, m0
@@ -79,6 +84,60 @@
   ret void
 }
 
+; CHECK-LABEL: {{^}}test_writelane_p3:
+; CHECK: v_writelane_b32 v{{[0-9]+}}, 0, 15
+; CHECK-NOT: v_writelane_b32
+define void @test_writelane_p3(i8 addrspace(3)* addrspace(1)* %out, i8 addrspace(3)* %src) #1 {
+  %writelane = call i8 addrspace(3)* @llvm.amdgcn.writelane.p3i8(i8 addrspace(3)* null, i32 15, i8 addrspace(3)* %src)
+  store i8 addrspace(3)* %writelane, i8 addrspace(3)* addrspace(1)* %out, align 4
+  ret void
+}
+
+; CHECK-LABEL: {{^}}test_writelane_i16:
+; CHECK: v_writelane_b32 v{{[0-9]+}},
+; CHECK-NOT: v_writelane_b32
+define void @test_writelane_i16(i16 addrspace(1)* %out, i16 %src) {
+  %writelane = call i16 @llvm.amdgcn.writelane.i16(i16 1234, i32 15, i16 %src)
+  store i16 %writelane, i16 addrspace(1)* %out, align 2
+  ret void
+}
+
+; CHECK-LABEL: {{^}}test_writelane_f16:
+; CHECK: v_writelane_b32 v{{[0-9]+}},
+; CHECK-NOT: v_writelane_b32
+define void @test_writelane_f16(half addrspace(1)* %out, half %src) {
+  %writelane = call half @llvm.amdgcn.writelane.f16(half 1.0, i32 15, half %src)
+  store half %writelane, half addrspace(1)* %out, align 2
+  ret void
+}
+
+; CHECK-LABEL: {{^}}test_writelane_v3i16:
+; CHECK-DAG: v_writelane_b32 v{{[0-9]+}},
+; CHECK-DAG: v_writelane_b32 v{{[0-9]+}},
+; CHECK-NOT: v_writelane_b32
+define void @test_writelane_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %src) {
+  %writelane = call <3 x i16> @llvm.amdgcn.writelane.v3i16(<3 x i16> zeroinitializer, i32 15, <3 x i16> %src)
+  store <3 x i16> %writelane, <3 x i16> addrspace(1)* %out, align 2
+  ret void
+}
+
+; CHECK-LABEL: {{^}}test_writelane_v9f32:
+; CHECK-DAG: v_writelane_b32 v{{[0-9]+}}, 0, 15
+; CHECK-DAG: v_writelane_b32 v{{[0-9]+}}, 0, 15
+; CHECK-DAG: v_writelane_b32 v{{[0-9]+}}, 0, 15
+; CHECK-DAG: v_writelane_b32 v{{[0-9]+}}, 0, 15
+; CHECK-DAG: v_writelane_b32 v{{[0-9]+}}, 0, 15
+; CHECK-DAG: v_writelane_b32 v{{[0-9]+}}, 0, 15
+; CHECK-DAG: v_writelane_b32 v{{[0-9]+}}, 0, 15
+; CHECK-DAG: v_writelane_b32 v{{[0-9]+}}, 0, 15
+; CHECK-DAG: v_writelane_b32 v{{[0-9]+}}, 0, 15
+; CHECK-NOT: v_writelane_b32
+define void @test_writelane_v9f32(<9 x float> addrspace(1)* %out, <9 x float> %src) {
+  %writelane = call <9 x float> @llvm.amdgcn.writelane.v9f32(<9 x float> zeroinitializer, i32 15, <9 x float> %src)
+  store <9 x float> %writelane, <9 x float> addrspace(1)* %out, align 2
+  ret void
+}
+
 declare i32 @llvm.amdgcn.workitem.id.x() #2
 
 attributes #0 = { nounwind readnone convergent }
Index: llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
+++ llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
@@ -1,6 +1,11 @@
 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope %s
 
 declare i32 @llvm.amdgcn.readlane(i32, i32) #0
+declare i8 addrspace(3)* @llvm.amdgcn.readlane.p3i8(i8 addrspace(3)*, i32) #0
+declare i16 @llvm.amdgcn.readlane.i16(i16, i32) #0
+declare half @llvm.amdgcn.readlane.f16(half, i32) #0
+declare <3 x i16> @llvm.amdgcn.readlane.v3i16(<3 x i16>, i32) #0
+declare <9 x float> @llvm.amdgcn.readlane.v9f32(<9 x float>, i32) #0
 
 ; CHECK-LABEL: {{^}}test_readlane_sreg_sreg:
 ; CHECK-NOT: v_readlane_b32
@@ -77,6 +82,60 @@
   ret void
 }
 
+; CHECK-LABEL: {{^}}test_readlane_p3:
+; CHECK: v_readlane_b32 s{{[0-9]+}}, v2, 15
+; CHECK-NOT: v_readlane_b32
+define void @test_readlane_p3(i8 addrspace(3)* addrspace(1)* %out, i8 addrspace(3)* %src) #1 {
+  %readlane = call i8 addrspace(3)* @llvm.amdgcn.readlane.p3i8(i8 addrspace(3)* %src, i32 15)
+  store i8 addrspace(3)* %readlane, i8 addrspace(3)* addrspace(1)* %out, align 4
+  ret void
+}
+
+; CHECK-LABEL: {{^}}test_readlane_i16:
+; CHECK: v_readlane_b32 s{{[0-9]+}}, v2, 15
+; CHECK-NOT: v_readlane_b32
+define void @test_readlane_i16(i16 addrspace(1)* %out, i16 %src) {
+  %readlane = call i16 @llvm.amdgcn.readlane.i16(i16 %src, i32 15)
+  store i16 %readlane, i16 addrspace(1)* %out, align 2
+  ret void
+}
+
+; CHECK-LABEL: {{^}}test_readlane_f16:
+; CHECK: v_readlane_b32 s{{[0-9]+}}, v2, 15
+; CHECK-NOT: v_readlane_b32
+define void @test_readlane_f16(half addrspace(1)* %out, half %src) {
+  %readlane = call half @llvm.amdgcn.readlane.f16(half %src, i32 15)
+  store half %readlane, half addrspace(1)* %out, align 2
+  ret void
+}
+
+; CHECK-LABEL: {{^}}test_readlane_v3i16:
+; CHECK-DAG: v_readlane_b32 s{{[0-9]+}},
+; CHECK-DAG: v_readlane_b32 s{{[0-9]+}},
+; CHECK-NOT: v_readlane_b32
+define void @test_readlane_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %src) {
+  %readlane = call <3 x i16> @llvm.amdgcn.readlane.v3i16(<3 x i16> %src, i32 15)
+  store <3 x i16> %readlane, <3 x i16> addrspace(1)* %out, align 2
+  ret void
+}
+
+; CHECK-LABEL: {{^}}test_readlane_v9f32:
+; CHECK-DAG: v_readlane_b32 s{{[0-9]+}}, v2, 15
+; CHECK-DAG: v_readlane_b32 s{{[0-9]+}}, v3, 15
+; CHECK-DAG: v_readlane_b32 s{{[0-9]+}}, v4, 15
+; CHECK-DAG: v_readlane_b32 s{{[0-9]+}}, v5, 15
+; CHECK-DAG: v_readlane_b32 s{{[0-9]+}}, v6, 15
+; CHECK-DAG: v_readlane_b32 s{{[0-9]+}}, v7, 15
+; CHECK-DAG: v_readlane_b32 s{{[0-9]+}}, v8, 15
+; CHECK-DAG: v_readlane_b32 s{{[0-9]+}}, v9, 15
+; CHECK-DAG: v_readlane_b32 s{{[0-9]+}}, v10, 15
+; CHECK-NOT: v_readlane_b32
+define void @test_readlane_v9f32(<9 x float> addrspace(1)* %out, <9 x float> %src) {
+  %readlane = call <9 x float> @llvm.amdgcn.readlane.v9f32(<9 x float> %src, i32 15)
+  store <9 x float> %readlane, <9 x float> addrspace(1)* %out, align 2
+  ret void
+}
+
 declare i32 @llvm.amdgcn.workitem.id.x() #2
 
 attributes #0 = { nounwind readnone convergent }
Index: llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
+++ llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
@@ -1,11 +1,20 @@
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope %s
 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope %s
 
-declare i32 @llvm.amdgcn.readfirstlane(i32) #0
+declare i32 @llvm.amdgcn.readfirstlane.i32(i32) #0
+declare float @llvm.amdgcn.readfirstlane.f32(float) #0
+declare <2 x half> @llvm.amdgcn.readfirstlane.v2f16(<2 x half>) #0
+declare <2 x i16> @llvm.amdgcn.readfirstlane.v2i16(<2 x i16>) #0
+declare i8 addrspace(3)* @llvm.amdgcn.readfirstlane.p3i8(i8 addrspace(3)*) #0
+declare i16 @llvm.amdgcn.readfirstlane.i16(i16) #0
+declare half @llvm.amdgcn.readfirstlane.f16(half) #0
+declare <3 x i16> @llvm.amdgcn.readfirstlane.v3i16(<3 x i16>) #0
+declare <9 x float> @llvm.amdgcn.readfirstlane.v9f32(<9 x float>) #0
 
-; CHECK-LABEL: {{^}}test_readfirstlane:
+; CHECK-LABEL: {{^}}test_readfirstlane_i32:
 ; CHECK: v_readfirstlane_b32 s{{[0-9]+}}, v2
-define void @test_readfirstlane(i32 addrspace(1)* %out, i32 %src) #1 {
-  %readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 %src)
+define void @test_readfirstlane_i32(i32 addrspace(1)* %out, i32 %src) #1 {
+  %readfirstlane = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %src)
   store i32 %readfirstlane, i32 addrspace(1)* %out, align 4
   ret void
 }
@@ -15,7 +24,7 @@
 ; CHECK-NOT: [[SGPR_VAL]]
 ; CHECK: ; use [[SGPR_VAL]]
 define amdgpu_kernel void @test_readfirstlane_imm(i32 addrspace(1)* %out) #1 {
-  %readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 32)
+  %readfirstlane = call i32 @llvm.amdgcn.readfirstlane.i32(i32 32)
   call void asm sideeffect "; use $0", "s"(i32 %readfirstlane)
   ret void
 }
@@ -25,7 +34,7 @@
 ; CHECK-NOT: [[VVAL]]
 ; CHECK: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[VVAL]]
 define amdgpu_kernel void @test_readfirstlane_imm_fold(i32 addrspace(1)* %out) #1 {
-  %readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 32)
+  %readfirstlane = call i32 @llvm.amdgcn.readfirstlane.i32(i32 32)
   store i32 %readfirstlane, i32 addrspace(1)* %out, align 4
   ret void
 }
@@ -36,7 +45,7 @@
 ; CHECK: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[VVAL]]
 define amdgpu_kernel void @test_readfirstlane_m0(i32 addrspace(1)* %out) #1 {
   %m0 = call i32 asm "s_mov_b32 m0, -1", "={m0}"()
-  %readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 %m0)
+  %readfirstlane = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %m0)
   store i32 %readfirstlane, i32 addrspace(1)* %out, align 4
   ret void
 }
@@ -51,7 +60,7 @@
 ; CHECK: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[VCOPY]]
 define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr(i32 addrspace(1)* %out) #1 {
   %sgpr = call i32 asm "s_mov_b32 $0, 0", "=s"()
-  %readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 %sgpr)
+  %readfirstlane = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %sgpr)
   store i32 %readfirstlane, i32 addrspace(1)* %out, align 4
   ret void
 }
@@ -62,10 +71,91 @@
 define amdgpu_kernel void @test_readfirstlane_fi(i32 addrspace(1)* %out) #1 {
   %alloca = alloca i32, addrspace(5)
   %int = ptrtoint i32 addrspace(5)* %alloca to i32
-  %readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 %int)
+  %readfirstlane = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %int)
   call void asm sideeffect "; use $0", "s"(i32 %readfirstlane)
   ret void
 }
 
+; CHECK-LABEL: {{^}}test_readfirstlane_f32:
+; CHECK: v_readfirstlane_b32 s{{[0-9]+}}, v2
+; CHECK-NOT: v_readfirstlane_b32
+define void @test_readfirstlane_f32(float addrspace(1)* %out, float %src) #1 {
+  %readfirstlane = call float @llvm.amdgcn.readfirstlane.f32(float %src)
+  store float %readfirstlane, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; CHECK-LABEL: {{^}}test_readfirstlane_v2f16:
+; CHECK: v_readfirstlane_b32 s{{[0-9]+}}, v2
+; CHECK-NOT: v_readfirstlane_b32
+define void @test_readfirstlane_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %src) #1 {
+  %readfirstlane = call <2 x half> @llvm.amdgcn.readfirstlane.v2f16(<2 x half> %src)
+  store <2 x half> %readfirstlane, <2 x half> addrspace(1)* %out, align 4
+  ret void
+}
+
+; CHECK-LABEL: {{^}}test_readfirstlane_v2i16:
+; CHECK: v_readfirstlane_b32 s{{[0-9]+}}, v2
+; CHECK-NOT: v_readfirstlane_b32
+define void @test_readfirstlane_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %src) #1 {
+  %readfirstlane = call <2 x i16> @llvm.amdgcn.readfirstlane.v2i16(<2 x i16> %src)
+  store <2 x i16> %readfirstlane, <2 x i16> addrspace(1)* %out, align 4
+  ret void
+}
+
+; CHECK-LABEL: {{^}}test_readfirstlane_p3:
+; CHECK: v_readfirstlane_b32 s{{[0-9]+}}, v2
+; CHECK-NOT: v_readfirstlane_b32
+define void @test_readfirstlane_p3(i8 addrspace(3)* addrspace(1)* %out, i8 addrspace(3)* %src) #1 {
+  %readfirstlane = call i8 addrspace(3)* @llvm.amdgcn.readfirstlane.p3i8(i8 addrspace(3)* %src)
+  store i8 addrspace(3)* %readfirstlane, i8 addrspace(3)* addrspace(1)* %out, align 4
+  ret void
+}
+
+; CHECK-LABEL: {{^}}test_readfirstlane_i16:
+; CHECK: v_readfirstlane_b32 s{{[0-9]+}}, v2
+; CHECK-NOT: v_readfirstlane_b32
+define void @test_readfirstlane_i16(i16 addrspace(1)* %out, i16 %src) {
+  %readfirstlane = call i16 @llvm.amdgcn.readfirstlane.i16(i16 %src)
+  store i16 %readfirstlane, i16 addrspace(1)* %out, align 2
+  ret void
+}
+
+; CHECK-LABEL: {{^}}test_readfirstlane_f16:
+; CHECK: v_readfirstlane_b32 s{{[0-9]+}}, v2
+; CHECK-NOT: v_readfirstlane_b32
+define void @test_readfirstlane_f16(half addrspace(1)* %out, half %src) {
+  %readfirstlane = call half @llvm.amdgcn.readfirstlane.f16(half %src)
+  store half %readfirstlane, half addrspace(1)* %out, align 2
+  ret void
+}
+
+; CHECK-LABEL: {{^}}test_readfirstlane_v3i16:
+; CHECK-DAG: v_readfirstlane_b32 s{{[0-9]+}},
+; CHECK-DAG: v_readfirstlane_b32 s{{[0-9]+}},
+; CHECK-NOT: v_readfirstlane_b32
+define void @test_readfirstlane_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %src) {
+  %readfirstlane = call <3 x i16> @llvm.amdgcn.readfirstlane.v3i16(<3 x i16> %src)
+  store <3 x i16> %readfirstlane, <3 x i16> addrspace(1)* %out, align 2
+  ret void
+}
+
+; CHECK-LABEL: {{^}}test_readfirstlane_v9f32:
+; CHECK-DAG: v_readfirstlane_b32 s{{[0-9]+}}, v2
+; CHECK-DAG: v_readfirstlane_b32 s{{[0-9]+}}, v3
+; CHECK-DAG: v_readfirstlane_b32 s{{[0-9]+}}, v4
+; CHECK-DAG: v_readfirstlane_b32 s{{[0-9]+}}, v5
+; CHECK-DAG: v_readfirstlane_b32 s{{[0-9]+}}, v6
+; CHECK-DAG: v_readfirstlane_b32 s{{[0-9]+}}, v7
+; CHECK-DAG: v_readfirstlane_b32 s{{[0-9]+}}, v8
+; CHECK-DAG: v_readfirstlane_b32 s{{[0-9]+}}, v9
+; CHECK-DAG: v_readfirstlane_b32 s{{[0-9]+}}, v10
+; CHECK-NOT: v_readfirstlane_b32
+define void @test_readfirstlane_v9f32(<9 x float> addrspace(1)* %out, <9 x float> %src) {
+  %readfirstlane = call <9 x float> @llvm.amdgcn.readfirstlane.v9f32(<9 x float> %src)
+  store <9 x float> %readfirstlane, <9 x float> addrspace(1)* %out, align 2
+  ret void
+}
+
 attributes #0 = { nounwind readnone convergent }
 attributes #1 = { nounwind }
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.readfirstlane.mir
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.readfirstlane.mir
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.readfirstlane.mir
@@ -1,11 +1,11 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -march=amdgcn -mcpu=tahiti -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=2 -pass-remarks-missed='gisel*' %s -o - 2> %t | FileCheck -check-prefix=GCN %s
+# RUN: llc -march=amdgcn -mcpu=tahiti -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=2 -pass-remarks-missed='gisel.*' %s -o - 2> %t | FileCheck -check-prefix=GCN %s
 # RUN: FileCheck -check-prefix=ERR %s < %t
 
-# ERR: remark: <unknown>:0:0: cannot select: %1:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), %0:sgpr(s32) (in function: readfirstlane_s)
+# ERR: remark: <unknown>:0:0: cannot select: %1:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), %0:sgpr(s32) (in function: readfirstlane_s32_s)
 
 ---
-name: readfirstlane_v
+name: readfirstlane_s32_v
 legalized: true
 regBankSelected: true
 tracksRegLiveness: true
@@ -13,7 +13,7 @@
 body: |
   bb.0:
     liveins: $vgpr0
-    ; GCN-LABEL: name: readfirstlane_v
+    ; GCN-LABEL: name: readfirstlane_s32_v
     ; GCN: liveins: $vgpr0
     ; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GCN: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec
@@ -24,7 +24,7 @@
 ...
 
 ---
-name: readfirstlane_v_imm
+name: readfirstlane_v_s32_imm
 legalized: true
 regBankSelected: true
 tracksRegLiveness: true
@@ -32,7 +32,7 @@
 body: |
   bb.0:
 
-    ; GCN-LABEL: name: readfirstlane_v_imm
+    ; GCN-LABEL: name: readfirstlane_v_s32_imm
     ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 123, implicit $exec
     ; GCN: [[COPY:%[0-9]+]]:sreg_32 = COPY [[V_MOV_B32_e32_]]
     ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 [[COPY]]
@@ -44,7 +44,7 @@
 
 # Make sure this fails to select
 ---
-name: readfirstlane_s
+name: readfirstlane_s32_s
 legalized: true
 regBankSelected: true
 tracksRegLiveness: true
@@ -52,7 +52,7 @@
 body: |
   bb.0:
     liveins: $sgpr0
-    ; GCN-LABEL: name: readfirstlane_s
+    ; GCN-LABEL: name: readfirstlane_s32_s
     ; GCN: liveins: $sgpr0
     ; GCN: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
     ; GCN: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY]](s32)
Index: llvm/lib/Target/AMDGPU/SIInstructions.td
===================================================================
--- llvm/lib/Target/AMDGPU/SIInstructions.td
+++ llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -2203,7 +2203,7 @@
 // FIXME: Should also do this for readlane, but tablegen crashes on
 // the ignored src1.
 def : GCNPat<
-  (int_amdgcn_readfirstlane (i32 imm:$src)),
+  (i32 (int_amdgcn_readfirstlane (i32 imm:$src))),
   (S_MOV_B32 SReg_32:$src)
 >;
 
Index: llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -715,6 +715,8 @@
 
   addPass(createAtomicExpandPass());
 
+  if (EnableAtomicOptimizations)
+    addPass(createAMDGPUAtomicOptimizerPass());
 
   addPass(createAMDGPULowerIntrinsicsPass());
 
@@ -871,10 +873,6 @@
 bool GCNPassConfig::addPreISel() {
   AMDGPUPassConfig::addPreISel();
 
-  if (EnableAtomicOptimizations) {
-    addPass(createAMDGPUAtomicOptimizerPass());
-  }
-
   // FIXME: We need to run a pass to propagate the attributes when calls are
   // supported.
 
Index: llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -655,6 +655,17 @@
       if (match(Src, PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readlane>())) {
         return IC.replaceInstUsesWith(II, Src);
       }
+
+      // readfirstlane (bitcast x) -> bitcast (readfirstlane x)
+      Value *BitcastInput = nullptr;
+      if (match(Src,
+                PatternMatch::m_BitCast(PatternMatch::m_Value(BitcastInput)))) {
+        CallInst *NewCall =
+            IC.Builder.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane,
+                                       {BitcastInput->getType()}, BitcastInput);
+        Value *NewCast = IC.Builder.CreateBitCast(NewCall, II.getType());
+        return IC.replaceInstUsesWith(II, NewCast);
+      }
     } else {
       // readlane (readlane x, y), y -> readlane x, y
       if (match(Src, PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readlane>(
Index: llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -224,6 +224,10 @@
 
   bool visitIntrinsicInst(IntrinsicInst &I);
   bool visitBitreverseIntrinsicInst(IntrinsicInst &I);
+  bool visitLaneIntrinsicInst(IntrinsicInst &I);
+  Value *buildLegalLaneIntrinsic(IRBuilder<> &B, Intrinsic::ID IID,
+                                 Value *Data0, Value *Lane = nullptr,
+                                 Value *Data1 = nullptr);
 
   bool doInitialization(Module &M) override;
   bool runOnFunction(Function &F) override;
@@ -1344,6 +1348,10 @@
   switch (I.getIntrinsicID()) {
   case Intrinsic::bitreverse:
     return visitBitreverseIntrinsicInst(I);
+  case Intrinsic::amdgcn_readfirstlane:
+  case Intrinsic::amdgcn_readlane:
+  case Intrinsic::amdgcn_writelane:
+    return visitLaneIntrinsicInst(I);
   default:
     return false;
   }
@@ -1359,6 +1367,138 @@
   return Changed;
 }
 
+Value *AMDGPUCodeGenPrepare::buildLegalLaneIntrinsic(IRBuilder<> &B,
+                                                     Intrinsic::ID IID,
+                                                     Value *Data0, Value *Lane,
+                                                     Value *Data1) {
+  Type *Ty = Data0->getType();
+
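+  // Base case: i32 is natively supported; emit the intrinsic directly. The
+  // argument count distinguishes readfirstlane (1), readlane (2) and
+  // writelane (3).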
+  if (Ty == B.getInt32Ty()) {
+    Value *Args[3] = {Data0, Lane, Data1};
+    unsigned NumArgs = Data1 != nullptr ? 3 : Lane != nullptr ? 2 : 1;
+    return B.CreateIntrinsic(IID, {B.getInt32Ty()}, {Args, NumArgs});
+  }
+
+  if (auto *VecTy = dyn_cast<FixedVectorType>(Ty)) {
+    Type *EltType = VecTy->getElementType();
+    bool is16Bit =
+        (EltType->isIntegerTy() && EltType->getIntegerBitWidth() == 16) ||
+        (EltType->isHalfTy());
+    int EC = VecTy->getElementCount().Min;
+
+    Value *Result = UndefValue::get(Ty);
+    for (int i = 0; i < EC; i += 1 + is16Bit) {
+      Value *EltData0;
+      Value *EltData1 = nullptr;
+
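+      // Pack pairs of 16-bit elements into a single i32 so each recursive
+      // call maps to one 32-bit lane operation; for an odd element count,
+      // the shuffle pads the final pair with undef.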
+      if (is16Bit) {
+        int Idxs[2] = {i, i + 1};
+        EltData0 = B.CreateShuffleVector(Data0, UndefValue::get(Ty), Idxs);
+        EltData0 = B.CreateBitCast(EltData0, B.getInt32Ty());
+      } else {
+        EltData0 = B.CreateExtractElement(Data0, i);
+      }
+
+      if (Data1) {
+        if (is16Bit) {
+          int Idxs[2] = {i, i + 1};
+          EltData1 = B.CreateShuffleVector(Data1, UndefValue::get(Ty), Idxs);
+          EltData1 = B.CreateBitCast(EltData1, B.getInt32Ty());
+        } else {
+          EltData1 = B.CreateExtractElement(Data1, i);
+        }
+      }
+
+      Value *EltResult =
+          buildLegalLaneIntrinsic(B, IID, EltData0, Lane, EltData1);
+
+      if (is16Bit) {
+        EltResult =
+            B.CreateBitCast(EltResult, FixedVectorType::get(EltType, 2));
+        for (int j = 0; j < 2; ++j) {
+          if (i + j >= EC)
+            break;
+          Result = B.CreateInsertElement(
+              Result, B.CreateExtractElement(EltResult, j), i + j);
+        }
+      } else {
+        Result = B.CreateInsertElement(Result, EltResult, i);
+      }
+    }
+
+    return Result;
+  }
+
+  unsigned BitWidth = DL->getTypeSizeInBits(Ty);
+  Type *IntTy = Ty;
+
+  if (!Ty->isIntegerTy()) {
+    IntTy = IntegerType::get(Mod->getContext(), BitWidth);
+    Data0 = B.CreateBitOrPointerCast(Data0, IntTy);
+    if (Data1)
+      Data1 = B.CreateBitOrPointerCast(Data1, IntTy);
+  }
+
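+  // Widen odd-sized values (e.g. i16, i48) to a multiple of 32 bits; the
+  // extra bits are zero-extended here and truncated away again below.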
+  if ((BitWidth % 32) != 0) {
+    Type *ExtendedTy =
+        IntegerType::get(Mod->getContext(), (BitWidth + 31) & ~31);
+    Data0 = B.CreateZExt(Data0, ExtendedTy);
+    if (Data1)
+      Data1 = B.CreateZExt(Data1, ExtendedTy);
+  }
+
+  if (BitWidth > 32) {
+    Type *VecTy = FixedVectorType::get(B.getInt32Ty(), (BitWidth + 31) / 32);
+    Data0 = B.CreateBitCast(Data0, VecTy);
+    if (Data1)
+      Data1 = B.CreateBitCast(Data1, VecTy);
+  }
+
+  Value *Result = buildLegalLaneIntrinsic(B, IID, Data0, Lane, Data1);
+
+  if ((BitWidth % 32) != 0) {
+    if (BitWidth > 32) {
+      Result = B.CreateBitCast(
+          Result, IntegerType::get(Mod->getContext(), (BitWidth + 31) & ~31));
+    }
+
+    Result =
+        B.CreateTrunc(Result, IntegerType::get(Mod->getContext(), BitWidth));
+  }
+
+  return B.CreateBitOrPointerCast(Result, Ty);
+}
+
+/// "Legalize" readfirstlane/readlane/writelane to single-dword intrinsics
+/// on i32.
+///
+/// Done during codegen prepare purely because this turned out to be simpler
+/// than doing it in this generality in SelectionDAG.
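+///
+/// For example, a 64-bit read such as
+///   %r = call i64 @llvm.amdgcn.readfirstlane.i64(i64 %v)
+/// becomes, roughly:
+///   %vec = bitcast i64 %v to <2 x i32>
+///   ... one @llvm.amdgcn.readfirstlane.i32 call per element ...
+///   %r = bitcast <2 x i32> %result to i64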
+bool AMDGPUCodeGenPrepare::visitLaneIntrinsicInst(IntrinsicInst &I) {
+  Type *Ty = I.getType();
+  if (Ty->isIntegerTy(32))
+    return false; // already legal
+
+  Value *Data0 = I.getArgOperand(0);
+  Value *Lane = nullptr;
+  Value *Data1 = nullptr;
+
+  if (I.getIntrinsicID() == Intrinsic::amdgcn_readlane) {
+    Lane = I.getArgOperand(1);
+  } else if (I.getIntrinsicID() == Intrinsic::amdgcn_writelane) {
+    Lane = I.getArgOperand(1);
+    Data1 = I.getArgOperand(2);
+  }
+
+  IRBuilder<> Builder(&I);
+  Value *Legalized =
+      buildLegalLaneIntrinsic(Builder, I.getIntrinsicID(), Data0, Lane, Data1);
+
+  I.replaceAllUsesWith(Legalized);
+  I.eraseFromParent();
+  return true;
+}
+
 bool AMDGPUCodeGenPrepare::doInitialization(Module &M) {
   Mod = &M;
   DL = &Mod->getDataLayout();
Index: llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
@@ -285,11 +285,11 @@
   Type *const Ty = V->getType();
   Module *M = B.GetInsertBlock()->getModule();
   Function *UpdateDPP =
-      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, Ty);
+      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, {Ty});
   Function *PermLaneX16 =
       Intrinsic::getDeclaration(M, Intrinsic::amdgcn_permlanex16, {});
   Function *ReadLane =
-      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {});
+      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {Ty});
 
   for (unsigned Idx = 0; Idx < 4; Idx++) {
     V = buildNonAtomicBinOp(
@@ -344,11 +344,11 @@
   Type *const Ty = V->getType();
   Module *M = B.GetInsertBlock()->getModule();
   Function *UpdateDPP =
-      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, Ty);
+      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, {Ty});
   Function *ReadLane =
-      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {});
+      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {Ty});
   Function *WriteLane =
-      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_writelane, {});
+      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_writelane, {Ty});
 
   if (ST->hasDPPWavefrontShifts()) {
     // GFX9 has DPP wavefront shift operations.
@@ -490,25 +490,8 @@
     // each active lane in the wavefront. This will be our new value which we
     // will provide to the atomic operation.
     Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1);
-    if (TyBitWidth == 64) {
-      Value *const ExtractLo = B.CreateTrunc(NewV, B.getInt32Ty());
-      Value *const ExtractHi =
-          B.CreateTrunc(B.CreateLShr(NewV, 32), B.getInt32Ty());
-      CallInst *const ReadLaneLo = B.CreateIntrinsic(
-          Intrinsic::amdgcn_readlane, {}, {ExtractLo, LastLaneIdx});
-      CallInst *const ReadLaneHi = B.CreateIntrinsic(
-          Intrinsic::amdgcn_readlane, {}, {ExtractHi, LastLaneIdx});
-      Value *const PartialInsert = B.CreateInsertElement(
-          UndefValue::get(VecTy), ReadLaneLo, B.getInt32(0));
-      Value *const Insert =
-          B.CreateInsertElement(PartialInsert, ReadLaneHi, B.getInt32(1));
-      NewV = B.CreateBitCast(Insert, Ty);
-    } else if (TyBitWidth == 32) {
-      NewV = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {},
-                               {NewV, LastLaneIdx});
-    } else {
-      llvm_unreachable("Unhandled atomic bit width");
-    }
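+    // readlane is overloaded on the data type, so the 64-bit case no longer
+    // needs to be split into two 32-bit reads.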
+    NewV = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {Ty},
+                             {NewV, LastLaneIdx});
 
     // Finally mark the readlanes in the WWM section.
     NewV = B.CreateIntrinsic(Intrinsic::amdgcn_wwm, Ty, NewV);
@@ -587,27 +570,8 @@
-    // We need to broadcast the value who was the lowest active lane (the first
-    // lane) to all other lanes in the wavefront. We use an intrinsic for this,
-    // but have to handle 64-bit broadcasts with two calls to this intrinsic.
+    // We need to broadcast the value of the lowest active lane (the first
+    // lane) to all other lanes in the wavefront. The readfirstlane intrinsic
+    // is overloaded, so 64-bit broadcasts no longer need to be split into
+    // two 32-bit calls.
-    Value *BroadcastI = nullptr;
-
-    if (TyBitWidth == 64) {
-      Value *const ExtractLo = B.CreateTrunc(PHI, B.getInt32Ty());
-      Value *const ExtractHi =
-          B.CreateTrunc(B.CreateLShr(PHI, 32), B.getInt32Ty());
-      CallInst *const ReadFirstLaneLo =
-          B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractLo);
-      CallInst *const ReadFirstLaneHi =
-          B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractHi);
-      Value *const PartialInsert = B.CreateInsertElement(
-          UndefValue::get(VecTy), ReadFirstLaneLo, B.getInt32(0));
-      Value *const Insert =
-          B.CreateInsertElement(PartialInsert, ReadFirstLaneHi, B.getInt32(1));
-      BroadcastI = B.CreateBitCast(Insert, Ty);
-    } else if (TyBitWidth == 32) {
-
-      BroadcastI = B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, PHI);
-    } else {
-      llvm_unreachable("Unhandled atomic bit width");
-    }
+    Value *BroadcastI =
+        B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {Ty}, {PHI});
 
     // Now that we have the result of our single atomic operation, we need to
     // get our individual lane's slice into the result. We use the lane offset
Index: llvm/include/llvm/IR/IntrinsicsAMDGPU.td
===================================================================
--- llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -1416,26 +1416,25 @@
             [IntrNoMem, IntrConvergent, IntrWillReturn]>;
 
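+// These lane intrinsics are overloaded on the data type. The GCCBuiltin
+// mappings are dropped because GCCBuiltin cannot express overloads; clang
+// emits the mangled calls directly in CGBuiltin.cpp instead.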
 def int_amdgcn_readfirstlane :
-  GCCBuiltin<"__builtin_amdgcn_readfirstlane">,
-  Intrinsic<[llvm_i32_ty], [llvm_i32_ty],
+  Intrinsic<[llvm_any_ty], [LLVMMatchType<0>],
             [IntrNoMem, IntrConvergent, IntrWillReturn]>;
 
 // The lane argument must be uniform across the currently active threads of the
 // current wave. Otherwise, the result is undefined.
 def int_amdgcn_readlane :
-  GCCBuiltin<"__builtin_amdgcn_readlane">,
-  Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
+  Intrinsic<[llvm_any_ty],
+            [LLVMMatchType<0>,  // data input
+             llvm_i32_ty],      // uniform lane select
             [IntrNoMem, IntrConvergent, IntrWillReturn]>;
 
 // The value to write and lane select arguments must be uniform across the
 // currently active threads of the current wave. Otherwise, the result is
 // undefined.
 def int_amdgcn_writelane :
-  GCCBuiltin<"__builtin_amdgcn_writelane">,
-  Intrinsic<[llvm_i32_ty], [
-    llvm_i32_ty,    // uniform value to write: returned by the selected lane
-    llvm_i32_ty,    // uniform lane select
-    llvm_i32_ty     // returned by all lanes other than the selected one
+  Intrinsic<[llvm_any_ty], [
+    LLVMMatchType<0>,  // uniform value to write: returned by the selected lane
+    llvm_i32_ty,       // uniform lane select
+    LLVMMatchType<0>   // returned by all lanes other than the selected one
   ],
   [IntrNoMem, IntrConvergent, IntrWillReturn]
 >;
Index: clang/test/CodeGenOpenCL/builtins-amdgcn.cl
===================================================================
--- clang/test/CodeGenOpenCL/builtins-amdgcn.cl
+++ clang/test/CodeGenOpenCL/builtins-amdgcn.cl
@@ -291,15 +291,15 @@
   *out = __builtin_amdgcn_ds_bpermute(a, b);
 }
 
-// CHECK-LABEL: @test_readfirstlane
-// CHECK: call i32 @llvm.amdgcn.readfirstlane(i32 %a)
+// CHECK-LABEL: @test_readfirstlane(
+// CHECK: call i32 @llvm.amdgcn.readfirstlane.i32(i32 %a)
 void test_readfirstlane(global int* out, int a)
 {
   *out = __builtin_amdgcn_readfirstlane(a);
 }
 
 // CHECK-LABEL: @test_readlane
-// CHECK: call i32 @llvm.amdgcn.readlane(i32 %a, i32 %b)
+// CHECK: call i32 @llvm.amdgcn.readlane.i32(i32 %a, i32 %b)
 void test_readlane(global int* out, int a, int b)
 {
   *out = __builtin_amdgcn_readlane(a, b);
Index: clang/lib/CodeGen/CGBuiltin.cpp
===================================================================
--- clang/lib/CodeGen/CGBuiltin.cpp
+++ clang/lib/CodeGen/CGBuiltin.cpp
@@ -14873,6 +14873,10 @@
     }
     LLVM_FALLTHROUGH;
   }
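+  // These builtins lower to overloaded intrinsics; emitUnaryBuiltin and
+  // emitBinaryBuiltin derive the intrinsic's mangled name from the argument
+  // type.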
+  case AMDGPU::BI__builtin_amdgcn_readfirstlane:
+    return emitUnaryBuiltin(*this, E, Intrinsic::amdgcn_readfirstlane);
+  case AMDGPU::BI__builtin_amdgcn_readlane:
+    return emitBinaryBuiltin(*this, E, Intrinsic::amdgcn_readlane);
   default:
     return nullptr;
   }