paulwalker-arm updated this revision to Diff 284380.
Herald added a project: clang.
Herald added a subscriber: cfe-commits.

Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D71767/new/

https://reviews.llvm.org/D71767

Files:
  clang/lib/Driver/ToolChains/Clang.cpp
  llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
  llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
  llvm/lib/Target/AArch64/AArch64ISelLowering.h
  llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
  llvm/lib/Target/AArch64/SVEInstrFormats.td
  llvm/test/CodeGen/AArch64/sve-fixed-length-int-extends.ll

Index: llvm/test/CodeGen/AArch64/sve-fixed-length-int-extends.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AArch64/sve-fixed-length-int-extends.ll
@@ -0,0 +1,373 @@
+; RUN: llc -aarch64-sve-vector-bits-min=128  -asm-verbose=0 < %s | FileCheck %s -check-prefix=NO_SVE
+; RUN: llc -aarch64-sve-vector-bits-min=256  -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK
+; RUN: llc -aarch64-sve-vector-bits-min=384  -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK
+; RUN: llc -aarch64-sve-vector-bits-min=512  -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=640  -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=768  -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=896  -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=1024 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=1152 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=1280 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=1408 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=1536 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=1664 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=1792 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=1920 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=2048 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048
+
+target triple = "aarch64-unknown-linux-gnu"
+
+; Don't use SVE when its registers are no bigger than NEON.
+; NO_SVE-NOT: {{z[0-9]}}
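+
+; NOTE: The VBITS_GE_* prefixes check output that is only guaranteed once
+; the minimum SVE vector length reaches the stated number of bits.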
+
+;
+; sext i8 -> i16
+;
+
+define void @sext_v16i8_v16i16(<16 x i8> %a, <16 x i16>* %out) #0 {
+; CHECK-LABEL: sext_v16i8_v16i16:
+; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
+; CHECK-NEXT: sunpklo [[A_HALFS:z[0-9]+]].h, z0.b
+; CHECK-NEXT: st1h { [[A_HALFS]].h }, [[PG]], [x0]
+; CHECK-NEXT: ret
+  %b = sext <16 x i8> %a to <16 x i16>
+  store <16 x i16> %b, <16 x i16>* %out
+  ret void
+}
+
+; NOTE: The extra 'add' prevents the extend from being folded into the load.
+define void @sext_v32i8_v32i16(<32 x i8>* %in, <32 x i16>* %out) #0 {
+; CHECK-LABEL: sext_v32i8_v32i16:
+; VBITS_GE_512: add [[A_BYTES:z[0-9]+]].b, {{p[0-9]+}}/m, {{z[0-9]+}}.b, {{z[0-9]+}}.b
+; VBITS_GE_512-NEXT: sunpklo [[A_HALFS:z[0-9]+]].h, [[A_BYTES]].b
+; VBITS_GE_512-NEXT: ptrue [[PG:p[0-9]+]].h, vl32
+; VBITS_GE_512-NEXT: st1h { [[A_HALFS]].h }, [[PG]], [x1]
+; VBITS_GE_512-NEXT: ret
+  %a = load <32 x i8>, <32 x i8>* %in
+  %b = add <32 x i8> %a, %a
+  %c = sext <32 x i8> %b to <32 x i16>
+  store <32 x i16> %c, <32 x i16>* %out
+  ret void
+}
+
+define void @sext_v64i8_v64i16(<64 x i8>* %in, <64 x i16>* %out) #0 {
+; CHECK-LABEL: sext_v64i8_v64i16:
+; VBITS_GE_1024: add [[A_BYTES:z[0-9]+]].b, {{p[0-9]+}}/m, {{z[0-9]+}}.b, {{z[0-9]+}}.b
+; VBITS_GE_1024-NEXT: sunpklo [[A_HALFS:z[0-9]+]].h, [[A_BYTES]].b
+; VBITS_GE_1024-NEXT: ptrue [[PG:p[0-9]+]].h, vl64
+; VBITS_GE_1024-NEXT: st1h { [[A_HALFS]].h }, [[PG]], [x1]
+; VBITS_GE_1024-NEXT: ret
+  %a = load <64 x i8>, <64 x i8>* %in
+  %b = add <64 x i8> %a, %a
+  %c = sext <64 x i8> %b to <64 x i16>
+  store <64 x i16> %c, <64 x i16>* %out
+  ret void
+}
+
+define void @sext_v128i8_v128i16(<128 x i8>* %in, <128 x i16>* %out) #0 {
+; CHECK-LABEL: sext_v128i8_v128i16:
+; VBITS_GE_2048: add [[A_BYTES:z[0-9]+]].b, {{p[0-9]+}}/m, {{z[0-9]+}}.b, {{z[0-9]+}}.b
+; VBITS_GE_2048-NEXT: sunpklo [[A_HALFS:z[0-9]+]].h, [[A_BYTES]].b
+; VBITS_GE_2048-NEXT: ptrue [[PG:p[0-9]+]].h, vl128
+; VBITS_GE_2048-NEXT: st1h { [[A_HALFS]].h }, [[PG]], [x1]
+; VBITS_GE_2048-NEXT: ret
+  %a = load <128 x i8>, <128 x i8>* %in
+  %b = add <128 x i8> %a, %a
+  %c = sext <128 x i8> %b to <128 x i16>
+  store <128 x i16> %c, <128 x i16>* %out
+  ret void
+}
+
+;
+; sext i8 -> i32
+;
+
+define void @sext_v8i8_v8i32(<8 x i8> %a, <8 x i32>* %out) #0 {
+; CHECK-LABEL: sext_v8i8_v8i32:
+; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
+; CHECK-NEXT: sunpklo [[A_HALFS:z[0-9]+]].h, z0.b
+; CHECK-NEXT: sunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h
+; CHECK-NEXT: st1w { [[A_WORDS]].s }, [[PG]], [x0]
+; CHECK-NEXT: ret
+  %b = sext <8 x i8> %a to <8 x i32>
+  store <8 x i32> %b, <8 x i32>* %out
+  ret void
+}
+
+define void @sext_v16i8_v16i32(<16 x i8> %a, <16 x i32>* %out) #0 {
+; CHECK-LABEL: sext_v16i8_v16i32:
+; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
+; VBITS_GE_512-NEXT: sunpklo [[A_HALFS:z[0-9]+]].h, z0.b
+; VBITS_GE_512-NEXT: sunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h
+; VBITS_GE_512-NEXT: st1w { [[A_WORDS]].s }, [[PG]], [x0]
+; VBITS_GE_512-NEXT: ret
+  %b = sext <16 x i8> %a to <16 x i32>
+  store <16 x i32> %b, <16 x i32>* %out
+  ret void
+}
+
+define void @sext_v32i8_v32i32(<32 x i8>* %in, <32 x i32>* %out) #0 {
+; CHECK-LABEL: sext_v32i8_v32i32:
+; VBITS_GE_1024: add [[A_BYTES:z[0-9]+]].b, {{p[0-9]+}}/m, {{z[0-9]+}}.b, {{z[0-9]+}}.b
+; VBITS_GE_1024-NEXT: sunpklo [[A_HALFS:z[0-9]+]].h, [[A_BYTES]].b
+; VBITS_GE_1024-NEXT: sunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h
+; VBITS_GE_1024-NEXT: ptrue [[PG:p[0-9]+]].s, vl32
+; VBITS_GE_1024-NEXT: st1w { [[A_WORDS]].s }, [[PG]], [x1]
+; VBITS_GE_1024-NEXT: ret
+  %a = load <32 x i8>, <32 x i8>* %in
+  %b = add <32 x i8> %a, %a
+  %c = sext <32 x i8> %b to <32 x i32>
+  store <32 x i32> %c, <32 x i32>* %out
+  ret void
+}
+
+define void @sext_v64i8_v64i32(<64 x i8>* %in, <64 x i32>* %out) #0 {
+; CHECK-LABEL: sext_v64i8_v64i32:
+; VBITS_GE_2048: add [[A_BYTES:z[0-9]+]].b, {{p[0-9]+}}/m, {{z[0-9]+}}.b, {{z[0-9]+}}.b
+; VBITS_GE_2048-NEXT: sunpklo [[A_HALFS:z[0-9]+]].h, [[A_BYTES]].b
+; VBITS_GE_2048-NEXT: sunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h
+; VBITS_GE_2048-NEXT: ptrue [[PG:p[0-9]+]].s, vl64
+; VBITS_GE_2048-NEXT: st1w { [[A_WORDS]].s }, [[PG]], [x1]
+; VBITS_GE_2048-NEXT: ret
+  %a = load <64 x i8>, <64 x i8>* %in
+  %b = add <64 x i8> %a, %a
+  %c = sext <64 x i8> %b to <64 x i32>
+  store <64 x i32> %c, <64 x i32>* %out
+  ret void
+}
+
+;
+; sext i8 -> i64
+;
+
+; NOTE: v4i8 is an unpacked type stored within a v4i16 container. The sign
+; extend is a two-step process where the container is any_extend'd with the
+; result feeding an inreg sign extend.
+define void @sext_v4i8_v4i64(<4 x i8> %a, <4 x i64>* %out) #0 {
+; CHECK-LABEL: sext_v4i8_v4i64:
+; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
+; CHECK-NEXT: uunpklo [[ANYEXT_W:z[0-9]+]].s, z0.h
+; CHECK-NEXT: uunpklo [[ANYEXT_D:z[0-9]+]].d, [[ANYEXT_W]].s
+; CHECK-NEXT: sxtb [[A_DWORDS:z[0-9]+]].d, [[PG]]/m, [[ANYEXT_D]].d
+; CHECK-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x0]
+; CHECK-NEXT: ret
+  %b = sext <4 x i8> %a to <4 x i64>
+  store <4 x i64> %b, <4 x i64>* %out
+  ret void
+}
+
+define void @sext_v8i8_v8i64(<8 x i8> %a, <8 x i64>* %out) #0 {
+; CHECK-LABEL: sext_v8i8_v8i64:
+; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
+; VBITS_GE_512-NEXT: sunpklo [[A_HALFS:z[0-9]+]].h, z0.b
+; VBITS_GE_512-NEXT: sunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h
+; VBITS_GE_512-NEXT: sunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s
+; VBITS_GE_512-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x0]
+; VBITS_GE_512-NEXT: ret
+  %b = sext <8 x i8> %a to <8 x i64>
+  store <8 x i64> %b, <8 x i64>* %out
+  ret void
+}
+
+define void @sext_v16i8_v16i64(<16 x i8> %a, <16 x i64>* %out) #0 {
+; CHECK-LABEL: sext_v16i8_v16i64:
+; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
+; VBITS_GE_1024-NEXT: sunpklo [[A_HALFS:z[0-9]+]].h, z0.b
+; VBITS_GE_1024-NEXT: sunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h
+; VBITS_GE_1024-NEXT: sunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s
+; VBITS_GE_1024-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x0]
+; VBITS_GE_1024-NEXT: ret
+  %b = sext <16 x i8> %a to <16 x i64>
+  store <16 x i64> %b, <16 x i64>* %out
+  ret void
+}
+
+define void @sext_v32i8_v32i64(<32 x i8>* %in, <32 x i64>* %out) #0 {
+; CHECK-LABEL: sext_v32i8_v32i64:
+; VBITS_GE_2048: add [[A_BYTES:z[0-9]+]].b, {{p[0-9]+}}/m, {{z[0-9]+}}.b, {{z[0-9]+}}.b
+; VBITS_GE_2048-NEXT: sunpklo [[A_HALFS:z[0-9]+]].h, [[A_BYTES]].b
+; VBITS_GE_2048-NEXT: sunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h
+; VBITS_GE_2048-NEXT: sunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s
+; VBITS_GE_2048-NEXT: ptrue [[PG:p[0-9]+]].d, vl32
+; VBITS_GE_2048-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x1]
+; VBITS_GE_2048-NEXT: ret
+  %a = load <32 x i8>, <32 x i8>* %in
+  %b = add <32 x i8> %a, %a
+  %c = sext <32 x i8> %b to <32 x i64>
+  store <32 x i64> %c, <32 x i64>* %out
+  ret void
+}
+
+;
+; sext i16 -> i32
+;
+
+define void @sext_v8i16_v8i32(<8 x i16> %a, <8 x i32>* %out) #0 {
+; CHECK-LABEL: sext_v8i16_v8i32:
+; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
+; CHECK-NEXT: sunpklo [[A_WORDS:z[0-9]+]].s, z0.h
+; CHECK-NEXT: st1w { [[A_WORDS]].s }, [[PG]], [x0]
+; CHECK-NEXT: ret
+  %b = sext <8 x i16> %a to <8 x i32>
+  store <8 x i32> %b, <8 x i32>* %out
+  ret void
+}
+
+define void @sext_v16i16_v16i32(<16 x i16>* %in, <16 x i32>* %out) #0 {
+; CHECK-LABEL: sext_v16i16_v16i32:
+; VBITS_GE_512: add [[A_HALFS:z[0-9]+]].h, {{p[0-9]+}}/m, {{z[0-9]+}}.h, {{z[0-9]+}}.h
+; VBITS_GE_512-NEXT: sunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h
+; VBITS_GE_512-NEXT: ptrue [[PG:p[0-9]+]].s, vl16
+; VBITS_GE_512-NEXT: st1w { [[A_WORDS]].s }, [[PG]], [x1]
+; VBITS_GE_512-NEXT: ret
+  %a = load <16 x i16>, <16 x i16>* %in
+  %b = add <16 x i16> %a, %a
+  %c = sext <16 x i16> %b to <16 x i32>
+  store <16 x i32> %c, <16 x i32>* %out
+  ret void
+}
+
+define void @sext_v32i16_v32i32(<32 x i16>* %in, <32 x i32>* %out) #0 {
+; CHECK-LABEL: sext_v32i16_v32i32:
+; VBITS_GE_1024: add [[A_HALFS:z[0-9]+]].h, {{p[0-9]+}}/m, {{z[0-9]+}}.h, {{z[0-9]+}}.h
+; VBITS_GE_1024-NEXT: sunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h
+; VBITS_GE_1024-NEXT: ptrue [[PG:p[0-9]+]].s, vl32
+; VBITS_GE_1024-NEXT: st1w { [[A_WORDS]].s }, [[PG]], [x1]
+; VBITS_GE_1024-NEXT: ret
+  %a = load <32 x i16>, <32 x i16>* %in
+  %b = add <32 x i16> %a, %a
+  %c = sext <32 x i16> %b to <32 x i32>
+  store <32 x i32> %c, <32 x i32>* %out
+  ret void
+}
+
+define void @sext_v64i16_v64i32(<64 x i16>* %in, <64 x i32>* %out) #0 {
+; CHECK-LABEL: sext_v64i16_v64i32:
+; VBITS_GE_2048: add [[A_HALFS:z[0-9]+]].h, {{p[0-9]+}}/m, {{z[0-9]+}}.h, {{z[0-9]+}}.h
+; VBITS_GE_2048-NEXT: sunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h
+; VBITS_GE_2048-NEXT: ptrue [[PG:p[0-9]+]].s, vl64
+; VBITS_GE_2048-NEXT: st1w { [[A_WORDS]].s }, [[PG]], [x1]
+; VBITS_GE_2048-NEXT: ret
+  %a = load <64 x i16>, <64 x i16>* %in
+  %b = add <64 x i16> %a, %a
+  %c = sext <64 x i16> %b to <64 x i32>
+  store <64 x i32> %c, <64 x i32>* %out
+  ret void
+}
+
+;
+; sext i16 -> i64
+;
+
+define void @sext_v4i16_v4i64(<4 x i16> %a, <4 x i64>* %out) #0 {
+; CHECK-LABEL: sext_v4i16_v4i64:
+; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
+; CHECK-NEXT: sunpklo [[A_WORDS:z[0-9]+]].s, z0.h
+; CHECK-NEXT: sunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s
+; CHECK-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x0]
+; CHECK-NEXT: ret
+  %b = sext <4 x i16> %a to <4 x i64>
+  store <4 x i64> %b, <4 x i64>* %out
+  ret void
+}
+
+define void @sext_v8i16_v8i64(<8 x i16> %a, <8 x i64>* %out) #0 {
+; CHECK-LABEL: sext_v8i16_v8i64:
+; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
+; VBITS_GE_512-NEXT: sunpklo [[A_WORDS:z[0-9]+]].s, z0.h
+; VBITS_GE_512-NEXT: sunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s
+; VBITS_GE_512-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x0]
+; VBITS_GE_512-NEXT: ret
+  %b = sext <8 x i16> %a to <8 x i64>
+  store <8 x i64> %b, <8 x i64>* %out
+  ret void
+}
+
+define void @sext_v16i16_v16i64(<16 x i16>* %in, <16 x i64>* %out) #0 {
+; CHECK-LABEL: sext_v16i16_v16i64:
+; VBITS_GE_1024: add [[A_HALFS:z[0-9]+]].h, {{p[0-9]+}}/m, {{z[0-9]+}}.h, {{z[0-9]+}}.h
+; VBITS_GE_1024-NEXT: sunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h
+; VBITS_GE_1024-NEXT: sunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s
+; VBITS_GE_1024-NEXT: ptrue [[PG:p[0-9]+]].d, vl16
+; VBITS_GE_1024-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x1]
+; VBITS_GE_1024-NEXT: ret
+  %a = load <16 x i16>, <16 x i16>* %in
+  %b = add <16 x i16> %a, %a
+  %c = sext <16 x i16> %b to <16 x i64>
+  store <16 x i64> %c, <16 x i64>* %out
+  ret void
+}
+
+define void @sext_v32i16_v32i64(<32 x i16>* %in, <32 x i64>* %out) #0 {
+; CHECK-LABEL: sext_v32i16_v32i64:
+; VBITS_GE_2048: add [[A_HALFS:z[0-9]+]].h, {{p[0-9]+}}/m, {{z[0-9]+}}.h, {{z[0-9]+}}.h
+; VBITS_GE_2048-NEXT: sunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h
+; VBITS_GE_2048-NEXT: sunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s
+; VBITS_GE_2048-NEXT: ptrue [[PG:p[0-9]+]].d, vl32
+; VBITS_GE_2048-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x1]
+; VBITS_GE_2048-NEXT: ret
+  %a = load <32 x i16>, <32 x i16>* %in
+  %b = add <32 x i16> %a, %a
+  %c = sext <32 x i16> %b to <32 x i64>
+  store <32 x i64> %c, <32 x i64>* %out
+  ret void
+}
+
+;
+; sext i32 -> i64
+;
+
+define void @sext_v4i32_v4i64(<4 x i32> %a, <4 x i64>* %out) #0 {
+; CHECK-LABEL: sext_v4i32_v4i64:
+; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
+; CHECK-NEXT: sunpklo [[A_DWORDS:z[0-9]+]].d, z0.s
+; CHECK-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x0]
+; CHECK-NEXT: ret
+  %b = sext <4 x i32> %a to <4 x i64>
+  store <4 x i64> %b, <4 x i64>* %out
+  ret void
+}
+
+define void @sext_v8i32_v8i64(<8 x i32>* %in, <8 x i64>* %out) #0 {
+; CHECK-LABEL: sext_v8i32_v8i64:
+; VBITS_GE_512: add [[A_WORDS:z[0-9]+]].s, {{p[0-9]+}}/m, {{z[0-9]+}}.s, {{z[0-9]+}}.s
+; VBITS_GE_512-NEXT: sunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s
+; VBITS_GE_512-NEXT: ptrue [[PG:p[0-9]+]].d, vl8
+; VBITS_GE_512-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x1]
+; VBITS_GE_512-NEXT: ret
+  %a = load <8 x i32>, <8 x i32>* %in
+  %b = add <8 x i32> %a, %a
+  %c = sext <8 x i32> %b to <8 x i64>
+  store <8 x i64> %c, <8 x i64>* %out
+  ret void
+}
+
+define void @sext_v16i32_v16i64(<16 x i32>* %in, <16 x i64>* %out) #0 {
+; CHECK-LABEL: sext_v16i32_v16i64:
+; VBITS_GE_1024: add [[A_WORDS:z[0-9]+]].s, {{p[0-9]+}}/m, {{z[0-9]+}}.s, {{z[0-9]+}}.s
+; VBITS_GE_1024-NEXT: sunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s
+; VBITS_GE_1024-NEXT: ptrue [[PG:p[0-9]+]].d, vl16
+; VBITS_GE_1024-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x1]
+; VBITS_GE_1024-NEXT: ret
+  %a = load <16 x i32>, <16 x i32>* %in
+  %b = add <16 x i32> %a, %a
+  %c = sext <16 x i32> %b to <16 x i64>
+  store <16 x i64> %c, <16 x i64>* %out
+  ret void
+}
+
+define void @sext_v32i32_v32i64(<32 x i32>* %in, <32 x i64>* %out) #0 {
+; CHECK-LABEL: sext_v32i32_v32i64:
+; VBITS_GE_2048: add [[A_WORDS:z[0-9]+]].s, {{p[0-9]+}}/m, {{z[0-9]+}}.s, {{z[0-9]+}}.s
+; VBITS_GE_2048-NEXT: sunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s
+; VBITS_GE_2048-NEXT: ptrue [[PG:p[0-9]+]].d, vl32
+; VBITS_GE_2048-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x1]
+; VBITS_GE_2048-NEXT: ret
+  %a = load <32 x i32>, <32 x i32>* %in
+  %b = add <32 x i32> %a, %a
+  %c = sext <32 x i32> %b to <32 x i64>
+  store <32 x i64> %c, <32 x i64>* %out
+  ret void
+}
+
+attributes #0 = { nounwind "target-features"="+sve" }
Index: llvm/lib/Target/AArch64/SVEInstrFormats.td
===================================================================
--- llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -404,6 +404,11 @@
 : Pat<(vtd (op vt1:$Op1, vt2:$Op2)),
       (inst (ptrue 31), $Op1, $Op2)>;
 
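+// Selects a merging in-register extend: the predicate selects the lanes to
+// extend, the VT operand gives the type being extended from, and the
+// passthru operand supplies the result for inactive lanes.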
+class SVE_InReg_Extend<ValueType vt, SDPatternOperator op, ValueType pt,
+                       ValueType inreg_vt, Instruction inst>
+: Pat<(vt (op pt:$Pg, vt:$Src, inreg_vt, vt:$PassThru)),
+      (inst $PassThru, $Pg, $Src)>;
+
 //
 // Pseudo -> Instruction mappings
 //
@@ -3688,9 +3693,9 @@
   def _S : sve_int_un_pred_arit<0b10, { opc, 0b0 }, asm, ZPR32>;
   def _D : sve_int_un_pred_arit<0b11, { opc, 0b0 }, asm, ZPR64>;
 
-  def : SVE_3_Op_Pat<nxv8i16, op, nxv8i16, nxv8i1,  nxv8i16, !cast<Instruction>(NAME # _H)>;
-  def : SVE_3_Op_Pat<nxv4i32, op, nxv4i32, nxv4i1,  nxv4i32, !cast<Instruction>(NAME # _S)>;
-  def : SVE_3_Op_Pat<nxv2i64, op, nxv2i64, nxv2i1,  nxv2i64, !cast<Instruction>(NAME # _D)>;
+  def : SVE_InReg_Extend<nxv8i16, op, nxv8i1, nxv8i8, !cast<Instruction>(NAME # _H)>;
+  def : SVE_InReg_Extend<nxv4i32, op, nxv4i1, nxv4i8, !cast<Instruction>(NAME # _S)>;
+  def : SVE_InReg_Extend<nxv2i64, op, nxv2i1, nxv2i8, !cast<Instruction>(NAME # _D)>;
 }
 
 multiclass sve_int_un_pred_arit_0_w<bits<3> opc, string asm,
@@ -3698,15 +3703,15 @@
   def _S : sve_int_un_pred_arit<0b10, { opc, 0b0 }, asm, ZPR32>;
   def _D : sve_int_un_pred_arit<0b11, { opc, 0b0 }, asm, ZPR64>;
 
-  def : SVE_3_Op_Pat<nxv4i32, op, nxv4i32, nxv4i1,  nxv4i32, !cast<Instruction>(NAME # _S)>;
-  def : SVE_3_Op_Pat<nxv2i64, op, nxv2i64, nxv2i1,  nxv2i64, !cast<Instruction>(NAME # _D)>;
+  def : SVE_InReg_Extend<nxv4i32, op, nxv4i1, nxv4i16, !cast<Instruction>(NAME # _S)>;
+  def : SVE_InReg_Extend<nxv2i64, op, nxv2i1, nxv2i16, !cast<Instruction>(NAME # _D)>;
 }
 
 multiclass sve_int_un_pred_arit_0_d<bits<3> opc, string asm,
                                     SDPatternOperator op> {
   def _D : sve_int_un_pred_arit<0b11, { opc, 0b0 }, asm, ZPR64>;
 
-  def : SVE_3_Op_Pat<nxv2i64, op, nxv2i64, nxv2i1,  nxv2i64, !cast<Instruction>(NAME # _D)>;
+  def : SVE_InReg_Extend<nxv2i64, op, nxv2i1, nxv2i32, !cast<Instruction>(NAME # _D)>;
 }
 
 multiclass sve_int_un_pred_arit_1<bits<3> opc, string asm,
Index: llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
===================================================================
--- llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -164,12 +164,12 @@
 
 def SDT_AArch64Arith : SDTypeProfile<1, 3, [
   SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>, SDTCisVec<3>,
-  SDTCVecEltisVT<1,i1>, SDTCisSameAs<2,3>
+  SDTCVecEltisVT<1,i1>, SDTCisSameAs<0,2>, SDTCisSameAs<2,3>
 ]>;
 
 def SDT_AArch64FMA : SDTypeProfile<1, 4, [
   SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>, SDTCisVec<3>, SDTCisVec<4>,
-  SDTCVecEltisVT<1,i1>, SDTCisSameAs<2,3>, SDTCisSameAs<3,4>
+  SDTCVecEltisVT<1,i1>, SDTCisSameAs<0,2>, SDTCisSameAs<2,3>, SDTCisSameAs<3,4>
 ]>;
 
 // Predicated operations with the result of inactive lanes being unspecified.
@@ -178,6 +178,8 @@
 def AArch64fadd_p : SDNode<"AArch64ISD::FADD_PRED", SDT_AArch64Arith>;
 def AArch64fdiv_p : SDNode<"AArch64ISD::FDIV_PRED", SDT_AArch64Arith>;
 def AArch64fma_p  : SDNode<"AArch64ISD::FMA_PRED",  SDT_AArch64FMA>;
+def AArch64fmaxnm_p : SDNode<"AArch64ISD::FMAXNM_PRED", SDT_AArch64Arith>;
+def AArch64fminnm_p : SDNode<"AArch64ISD::FMINNM_PRED", SDT_AArch64Arith>;
 def AArch64fmul_p : SDNode<"AArch64ISD::FMUL_PRED", SDT_AArch64Arith>;
 def AArch64fsub_p : SDNode<"AArch64ISD::FSUB_PRED", SDT_AArch64Arith>;
 def AArch64lsl_p  : SDNode<"AArch64ISD::SHL_PRED",  SDT_AArch64Arith>;
@@ -186,10 +188,20 @@
 def AArch64sdiv_p : SDNode<"AArch64ISD::SDIV_PRED", SDT_AArch64Arith>;
 def AArch64smax_p : SDNode<"AArch64ISD::SMAX_PRED", SDT_AArch64Arith>;
 def AArch64smin_p : SDNode<"AArch64ISD::SMIN_PRED", SDT_AArch64Arith>;
+def AArch64sub_p  : SDNode<"AArch64ISD::SUB_PRED",  SDT_AArch64Arith>;
 def AArch64udiv_p : SDNode<"AArch64ISD::UDIV_PRED", SDT_AArch64Arith>;
 def AArch64umax_p : SDNode<"AArch64ISD::UMAX_PRED", SDT_AArch64Arith>;
 def AArch64umin_p : SDNode<"AArch64ISD::UMIN_PRED", SDT_AArch64Arith>;
 
+def SDT_AArch64IntExtend : SDTypeProfile<1, 4, [
+  SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>, SDTCisVT<3, OtherVT>, SDTCisVec<4>,
+  SDTCVecEltisVT<1,i1>, SDTCisSameAs<0,2>, SDTCisVTSmallerThanOp<3, 2>, SDTCisSameAs<0,4>
+]>;
+
+// Predicated operations with the result of inactive lanes provided by the last operand.
+def AArch64sxt_mt : SDNode<"AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU", SDT_AArch64IntExtend>;
+def AArch64uxt_mt : SDNode<"AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU", SDT_AArch64IntExtend>;
+
 def SDT_AArch64ReduceWithInit : SDTypeProfile<1, 3, [SDTCisVec<1>, SDTCisVec<3>]>;
 def AArch64clasta_n   : SDNode<"AArch64ISD::CLASTA_N",   SDT_AArch64ReduceWithInit>;
 def AArch64clastb_n   : SDNode<"AArch64ISD::CLASTB_N",   SDT_AArch64ReduceWithInit>;
@@ -246,6 +258,7 @@
   defm SUBR_ZPmZ : sve_int_bin_pred_arit_0<0b011, "subr", "SUBR_ZPZZ", int_aarch64_sve_subr, DestructiveBinaryCommWithRev, "SUB_ZPmZ", /*isReverseInstr*/ 1>;
 
   defm ADD_ZPZZ  : sve_int_bin_pred_bhsd<AArch64add_p>;
+  defm SUB_ZPZZ  : sve_int_bin_pred_bhsd<AArch64sub_p>;
 
   let Predicates = [HasSVE, UseExperimentalZeroingPseudos] in {
     defm ADD_ZPZZ  : sve_int_bin_pred_zeroing_bhsd<int_aarch64_sve_add>;
@@ -312,12 +325,12 @@
   defm SDOT_ZZZI : sve_intx_dot_by_indexed_elem<0b0, "sdot", int_aarch64_sve_sdot_lane>;
   defm UDOT_ZZZI : sve_intx_dot_by_indexed_elem<0b1, "udot", int_aarch64_sve_udot_lane>;
 
-  defm SXTB_ZPmZ : sve_int_un_pred_arit_0_h<0b000, "sxtb", int_aarch64_sve_sxtb>;
-  defm UXTB_ZPmZ : sve_int_un_pred_arit_0_h<0b001, "uxtb", int_aarch64_sve_uxtb>;
-  defm SXTH_ZPmZ : sve_int_un_pred_arit_0_w<0b010, "sxth", int_aarch64_sve_sxth>;
-  defm UXTH_ZPmZ : sve_int_un_pred_arit_0_w<0b011, "uxth", int_aarch64_sve_uxth>;
-  defm SXTW_ZPmZ : sve_int_un_pred_arit_0_d<0b100, "sxtw", int_aarch64_sve_sxtw>;
-  defm UXTW_ZPmZ : sve_int_un_pred_arit_0_d<0b101, "uxtw", int_aarch64_sve_uxtw>;
+  defm SXTB_ZPmZ : sve_int_un_pred_arit_0_h<0b000, "sxtb", AArch64sxt_mt>;
+  defm UXTB_ZPmZ : sve_int_un_pred_arit_0_h<0b001, "uxtb", AArch64uxt_mt>;
+  defm SXTH_ZPmZ : sve_int_un_pred_arit_0_w<0b010, "sxth", AArch64sxt_mt>;
+  defm UXTH_ZPmZ : sve_int_un_pred_arit_0_w<0b011, "uxth", AArch64uxt_mt>;
+  defm SXTW_ZPmZ : sve_int_un_pred_arit_0_d<0b100, "sxtw", AArch64sxt_mt>;
+  defm UXTW_ZPmZ : sve_int_un_pred_arit_0_d<0b101, "uxtw", AArch64uxt_mt>;
   defm ABS_ZPmZ  : sve_int_un_pred_arit_0<  0b110, "abs",  int_aarch64_sve_abs>;
   defm NEG_ZPmZ  : sve_int_un_pred_arit_0<  0b111, "neg",  int_aarch64_sve_neg>;
 
@@ -375,6 +388,8 @@
   defm FADD_ZPZZ   : sve_fp_bin_pred_hfd<AArch64fadd_p>;
   defm FSUB_ZPZZ   : sve_fp_bin_pred_hfd<AArch64fsub_p>;
   defm FMUL_ZPZZ   : sve_fp_bin_pred_hfd<AArch64fmul_p>;
+  defm FMAXNM_ZPZZ : sve_fp_bin_pred_hfd<AArch64fmaxnm_p>;
+  defm FMINNM_ZPZZ : sve_fp_bin_pred_hfd<AArch64fminnm_p>;
   defm FDIV_ZPZZ   : sve_fp_bin_pred_hfd<AArch64fdiv_p>;
 
   let Predicates = [HasSVE, UseExperimentalZeroingPseudos] in {
Index: llvm/lib/Target/AArch64/AArch64ISelLowering.h
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -77,6 +77,8 @@
   FADD_PRED,
   FDIV_PRED,
   FMA_PRED,
+  FMAXNM_PRED,
+  FMINNM_PRED,
   FMUL_PRED,
   FSUB_PRED,
   MUL_PRED,
@@ -86,10 +88,16 @@
   SMIN_PRED,
   SRA_PRED,
   SRL_PRED,
+  SUB_PRED,
   UDIV_PRED,
   UMAX_PRED,
   UMIN_PRED,
 
+  // Predicated instructions with the result of inactive lanes provided by the
+  // last operand.
+  SIGN_EXTEND_INREG_MERGE_PASSTHRU,
+  ZERO_EXTEND_INREG_MERGE_PASSTHRU,
+
   SETCC_MERGE_ZERO,
 
   // Arithmetic instructions which write flags.
@@ -870,6 +878,7 @@
   SDValue LowerVectorSRA_SRL_SHL(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerShiftLeftParts(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerShiftRightParts(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerVSELECT(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerCTPOP(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerF128Call(SDValue Op, SelectionDAG &DAG,
@@ -896,7 +905,10 @@
   SDValue LowerSVEStructLoad(unsigned Intrinsic, ArrayRef<SDValue> LoadOps,
                              EVT VT, SelectionDAG &DAG, const SDLoc &DL) const;
 
+  SDValue LowerFixedLengthVectorIntExtendToSVE(SDValue Op,
+                                               SelectionDAG &DAG) const;
   SDValue LowerFixedLengthVectorLoadToSVE(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerFixedLengthVectorSetccToSVE(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerFixedLengthVectorStoreToSVE(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerFixedLengthVectorTruncateToSVE(SDValue Op,
                                               SelectionDAG &DAG) const;
Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -128,6 +128,19 @@
          VT.getSizeInBits().getKnownMinSize() == AArch64::SVEBitsPerBlock;
 }
 
+// Returns true for ####_MERGE_PASSTHRU opcodes, whose operand list starts
+// with a predicate and ends with a passthru value matching the result type.
+static bool isMergePassthruOpcode(unsigned Opc) {
+  switch (Opc) {
+  default:
+    return false;
+  case AArch64ISD::DUP_MERGE_PASSTHRU:
+  case AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU:
+  case AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU:
+    return true;
+  }
+}
+
 AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
                                              const AArch64Subtarget &STI)
     : TargetLowering(TM), Subtarget(&STI) {
@@ -1082,17 +1095,39 @@
   // Lower fixed length vector operations to scalable equivalents.
   setOperationAction(ISD::ADD, VT, Custom);
   setOperationAction(ISD::AND, VT, Custom);
+  setOperationAction(ISD::ANY_EXTEND, VT, Custom);
   setOperationAction(ISD::FADD, VT, Custom);
   setOperationAction(ISD::FDIV, VT, Custom);
   setOperationAction(ISD::FMA, VT, Custom);
+  setOperationAction(ISD::FMAXNUM, VT, Custom);
+  setOperationAction(ISD::FMINNUM, VT, Custom);
   setOperationAction(ISD::FMUL, VT, Custom);
   setOperationAction(ISD::FSUB, VT, Custom);
   setOperationAction(ISD::LOAD, VT, Custom);
   setOperationAction(ISD::MUL, VT, Custom);
   setOperationAction(ISD::OR, VT, Custom);
+  setOperationAction(ISD::SETCC, VT, Custom);
+  setOperationAction(ISD::SHL, VT, Custom);
+  setOperationAction(ISD::SMAX, VT, Custom);
+  setOperationAction(ISD::SMIN, VT, Custom);
+  setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Custom);
+  setOperationAction(ISD::SRA, VT, Custom);
+  setOperationAction(ISD::SRL, VT, Custom);
   setOperationAction(ISD::STORE, VT, Custom);
+  setOperationAction(ISD::SUB, VT, Custom);
   setOperationAction(ISD::TRUNCATE, VT, Custom);
+  setOperationAction(ISD::UMAX, VT, Custom);
+  setOperationAction(ISD::UMIN, VT, Custom);
+  setOperationAction(ISD::VSELECT, VT, Custom);
   setOperationAction(ISD::XOR, VT, Custom);
+  setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
+
+  if (VT.getVectorElementType() == MVT::i32 ||
+      VT.getVectorElementType() == MVT::i64) {
+    setOperationAction(ISD::SDIV, VT, Custom);
+    setOperationAction(ISD::UDIV, VT, Custom);
+  }
 }
 
 void AArch64TargetLowering::addDRTypeForNEON(MVT VT) {
@@ -1412,9 +1447,12 @@
     MAKE_CASE(AArch64ISD::SMIN_PRED)
     MAKE_CASE(AArch64ISD::SRA_PRED)
     MAKE_CASE(AArch64ISD::SRL_PRED)
+    MAKE_CASE(AArch64ISD::SUB_PRED)
     MAKE_CASE(AArch64ISD::UDIV_PRED)
     MAKE_CASE(AArch64ISD::UMAX_PRED)
     MAKE_CASE(AArch64ISD::UMIN_PRED)
+    MAKE_CASE(AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU)
+    MAKE_CASE(AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU)
     MAKE_CASE(AArch64ISD::SETCC_MERGE_ZERO)
     MAKE_CASE(AArch64ISD::ADC)
     MAKE_CASE(AArch64ISD::SBC)
@@ -1509,8 +1547,10 @@
     MAKE_CASE(AArch64ISD::FDIV_PRED)
     MAKE_CASE(AArch64ISD::FMA_PRED)
     MAKE_CASE(AArch64ISD::FMAXV_PRED)
+    MAKE_CASE(AArch64ISD::FMAXNM_PRED)
     MAKE_CASE(AArch64ISD::FMAXNMV_PRED)
     MAKE_CASE(AArch64ISD::FMINV_PRED)
+    MAKE_CASE(AArch64ISD::FMINNM_PRED)
     MAKE_CASE(AArch64ISD::FMINNMV_PRED)
     MAKE_CASE(AArch64ISD::FMUL_PRED)
     MAKE_CASE(AArch64ISD::FSUB_PRED)
@@ -3297,6 +3337,43 @@
                        Op.getOperand(1), Scalar);
   }
 
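+  // Lower the sxt[bhw]/uxt[bhw] intrinsics to the merging extend nodes,
+  // passing the type being extended from as an extra VT operand.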
+  case Intrinsic::aarch64_sve_sxtb:
+    return DAG.getNode(
+        AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
+        Op.getOperand(2), Op.getOperand(3),
+        DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i8)),
+        Op.getOperand(1));
+  case Intrinsic::aarch64_sve_sxth:
+    return DAG.getNode(
+        AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
+        Op.getOperand(2), Op.getOperand(3),
+        DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i16)),
+        Op.getOperand(1));
+  case Intrinsic::aarch64_sve_sxtw:
+    return DAG.getNode(
+        AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
+        Op.getOperand(2), Op.getOperand(3),
+        DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i32)),
+        Op.getOperand(1));
+  case Intrinsic::aarch64_sve_uxtb:
+    return DAG.getNode(
+        AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
+        Op.getOperand(2), Op.getOperand(3),
+        DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i8)),
+        Op.getOperand(1));
+  case Intrinsic::aarch64_sve_uxth:
+    return DAG.getNode(
+        AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
+        Op.getOperand(2), Op.getOperand(3),
+        DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i16)),
+        Op.getOperand(1));
+  case Intrinsic::aarch64_sve_uxtw:
+    return DAG.getNode(
+        AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
+        Op.getOperand(2), Op.getOperand(3),
+        DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i32)),
+        Op.getOperand(1));
+
   case Intrinsic::localaddress: {
     const auto &MF = DAG.getMachineFunction();
     const auto *RegInfo = Subtarget->getRegisterInfo();
@@ -3600,6 +3677,8 @@
   case ISD::FLT_ROUNDS_:
     return LowerFLT_ROUNDS_(Op, DAG);
   case ISD::MUL:
+    if (useSVEForFixedLengthVectorVT(Op.getValueType()))
+      return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED);
     return LowerMUL(Op, DAG);
   case ISD::INTRINSIC_WO_CHAIN:
     return LowerINTRINSIC_WO_CHAIN(Op, DAG);
@@ -3621,6 +3700,13 @@
     return LowerDYNAMIC_STACKALLOC(Op, DAG);
   case ISD::VSCALE:
     return LowerVSCALE(Op, DAG);
+  case ISD::ANY_EXTEND:
+  case ISD::SIGN_EXTEND:
+  case ISD::ZERO_EXTEND:
+    return LowerFixedLengthVectorIntExtendToSVE(Op, DAG);
+  case ISD::SIGN_EXTEND_INREG:
+    return LowerToPredicatedOp(Op, DAG,
+                               AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU);
   case ISD::TRUNCATE:
     return LowerTRUNCATE(Op, DAG);
   case ISD::LOAD:
@@ -3633,6 +3719,20 @@
     llvm_unreachable("Unexpected request to lower ISD::ADD");
   case ISD::AND:
     return LowerToScalableOp(Op, DAG);
+  case ISD::SUB:
+    if (useSVEForFixedLengthVectorVT(Op.getValueType()))
+      return LowerToPredicatedOp(Op, DAG, AArch64ISD::SUB_PRED);
+    llvm_unreachable("Unexpected request to lower ISD::SUB");
+  case ISD::FMAXNUM:
+    if (useSVEForFixedLengthVectorVT(Op.getValueType()))
+      return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMAXNM_PRED);
+    llvm_unreachable("Unexpected request to lower ISD::FMAXNUM");
+  case ISD::FMINNUM:
+    if (useSVEForFixedLengthVectorVT(Op.getValueType()))
+      return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMINNM_PRED);
+    llvm_unreachable("Unexpected request to lower ISD::FMINNUM");
+  case ISD::VSELECT:
+    return LowerVSELECT(Op, DAG);
   }
 }
 
@@ -8979,7 +9079,7 @@
     llvm_unreachable("unexpected shift opcode");
 
   case ISD::SHL:
-    if (VT.isScalableVector())
+    if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT))
       return LowerToPredicatedOp(Op, DAG, AArch64ISD::SHL_PRED);
 
     if (isVShiftLImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize)
@@ -8991,7 +9091,7 @@
                        Op.getOperand(0), Op.getOperand(1));
   case ISD::SRA:
   case ISD::SRL:
-    if (VT.isScalableVector()) {
+    if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT)) {
       unsigned Opc = Op.getOpcode() == ISD::SRA ? AArch64ISD::SRA_PRED
                                                 : AArch64ISD::SRL_PRED;
       return LowerToPredicatedOp(Op, DAG, Opc);
@@ -9125,6 +9225,9 @@
     return LowerToPredicatedOp(Op, DAG, AArch64ISD::SETCC_MERGE_ZERO);
   }
 
+  if (useSVEForFixedLengthVectorVT(Op.getOperand(0).getValueType()))
+    return LowerFixedLengthVectorSetccToSVE(Op, DAG);
+
   ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
   SDValue LHS = Op.getOperand(0);
   SDValue RHS = Op.getOperand(1);
@@ -11107,6 +11210,10 @@
   if (VT.isScalableVector())
     return performSVEAndCombine(N, DCI);
 
+  // TODO: useSVEForFixedLengthVectorVT?
+  if (VT.getSizeInBits() > 128)
+    return SDValue();
+
   BuildVectorSDNode *BVN =
       dyn_cast<BuildVectorSDNode>(N->getOperand(1).getNode());
   if (!BVN)
@@ -15229,6 +15336,42 @@
       Store->isTruncatingStore());
 }
 
+SDValue AArch64TargetLowering::LowerFixedLengthVectorIntExtendToSVE(
+    SDValue Op, SelectionDAG &DAG) const {
+  EVT VT = Op.getValueType();
+  assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
+
+  SDLoc DL(Op);
+  SDValue Val = Op.getOperand(0);
+  EVT ContainerVT = getContainerForFixedLengthVector(DAG, Val.getValueType());
+  Val = convertToScalableVector(DAG, ContainerVT, Val);
+
+  bool Signed = Op.getOpcode() == ISD::SIGN_EXTEND;
+  unsigned ExtendOpc = Signed ? AArch64ISD::SUNPKLO : AArch64ISD::UUNPKLO;
+
+  // Repeatedly unpack Val until the result is of the desired element type.
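+  // For example, a v8i8 source is widened via its nxv16i8 container, so
+  // extending to v8i64 takes three unpacks:
+  // nxv16i8 -> nxv8i16 -> nxv4i32 -> nxv2i64.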
+  switch (ContainerVT.getSimpleVT().SimpleTy) {
+  default:
+    llvm_unreachable("unimplemented container type");
+  case MVT::nxv16i8:
+    Val = DAG.getNode(ExtendOpc, DL, MVT::nxv8i16, Val);
+    if (VT.getVectorElementType() == MVT::i16)
+      break;
+    LLVM_FALLTHROUGH;
+  case MVT::nxv8i16:
+    Val = DAG.getNode(ExtendOpc, DL, MVT::nxv4i32, Val);
+    if (VT.getVectorElementType() == MVT::i32)
+      break;
+    LLVM_FALLTHROUGH;
+  case MVT::nxv4i32:
+    Val = DAG.getNode(ExtendOpc, DL, MVT::nxv2i64, Val);
+    assert(VT.getVectorElementType() == MVT::i64 && "Unexpected element type!");
+    break;
+  }
+
+  return convertFromScalableVector(DAG, VT, Val);
+}
+
 SDValue AArch64TargetLowering::LowerFixedLengthVectorTruncateToSVE(
     SDValue Op, SelectionDAG &DAG) const {
   EVT VT = Op.getValueType();
@@ -15267,6 +15410,7 @@
 
 // Convert vector operation 'Op' to an equivalent predicated operation whereby
 // the original operation's type is used to construct a suitable predicate.
+// NOTE: The results for inactive lanes are undefined.
 SDValue AArch64TargetLowering::LowerToPredicatedOp(SDValue Op,
                                                    SelectionDAG &DAG,
                                                    unsigned NewOp,
@@ -15286,11 +15430,21 @@
         continue;
       }
 
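+      // A VT operand carries a type, not data; rebuild it so its element
+      // count matches the scalable container (e.g. the v8i8 from-type of a
+      // SIGN_EXTEND_INREG on an nxv8i16 container becomes nxv8i8).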
+      if (const VTSDNode *VTNode = dyn_cast<VTSDNode>(V)) {
+        EVT VTArg = VTNode->getVT().getVectorElementType();
+        EVT NewVTArg = ContainerVT.changeVectorElementType(VTArg);
+        Operands.push_back(DAG.getValueType(NewVTArg));
+        continue;
+      }
+
       assert(useSVEForFixedLengthVectorVT(V.getValueType(), OverrideNEON) &&
              "Only fixed length vectors are supported!");
       Operands.push_back(convertToScalableVector(DAG, ContainerVT, V));
     }
 
+    if (isMergePassthruOpcode(NewOp))
+      Operands.push_back(DAG.getUNDEF(ContainerVT));
+
     auto ScalableRes = DAG.getNode(NewOp, DL, ContainerVT, Operands);
     return convertFromScalableVector(DAG, VT, ScalableRes);
   }
@@ -15304,6 +15458,9 @@
     Operands.push_back(V);
   }
 
+  if (isMergePassthruOpcode(NewOp))
+    Operands.push_back(DAG.getUNDEF(VT));
+
   return DAG.getNode(NewOp, DL, VT, Operands);
 }
 
@@ -15328,3 +15485,47 @@
   auto ScalableRes = DAG.getNode(Op.getOpcode(), SDLoc(Op), ContainerVT, Ops);
   return convertFromScalableVector(DAG, VT, ScalableRes);
 }
+
+SDValue AArch64TargetLowering::LowerFixedLengthVectorSetccToSVE(
+    SDValue Op, SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  EVT InVT = Op.getOperand(0).getValueType();
+  EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
+
+  // Expand floating point vector comparisons.
+  if (InVT.isFloatingPoint())
+    return SDValue();
+
+  auto Op1 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(0));
+  auto Op2 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(1));
+  auto Pg = getPredicateForFixedLengthVector(DAG, DL, InVT);
+
+  EVT CmpVT = Pg.getValueType();
+  SmallVector<SDValue, 4> CmpOps = {Pg, Op1, Op2, Op.getOperand(2)};
+  auto Cmp = DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, CmpVT, CmpOps);
+
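+  // Widen the i1 predicate result to the container's element type, convert
+  // back to a fixed-length vector, then truncate to the setcc result type.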
+  auto Promote = DAG.getBoolExtOrTrunc(Cmp, DL, ContainerVT, InVT);
+  auto Extract = convertFromScalableVector(DAG, InVT, Promote);
+  return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Extract);
+}
+
+SDValue AArch64TargetLowering::LowerVSELECT(SDValue Op,
+                                            SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+
+  EVT InVT = Op.getOperand(1).getValueType();
+  EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
+  auto Op1 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(1));
+  auto Op2 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(2));
+
+  // Convert the mask to a predicate (NOTE: we don't need to worry about
+  // inactive lanes since VSELECT is safe when given undefined elements).
+  EVT MaskVT = Op.getOperand(0).getValueType();
+  EVT MaskContainerVT = getContainerForFixedLengthVector(DAG, MaskVT);
+  auto Mask = convertToScalableVector(DAG, MaskContainerVT, Op.getOperand(0));
+  Mask = DAG.getNode(ISD::TRUNCATE, DL,
+                     MaskContainerVT.changeVectorElementType(MVT::i1), Mask);
+
+  auto VSel = DAG.getNode(ISD::VSELECT, DL, ContainerVT, Mask, Op1, Op2);
+  return convertFromScalableVector(DAG, InVT, VSel);
+}
Index: llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -20566,7 +20566,8 @@
     EVT N0SrcSVT = N0Src.getValueType().getScalarType();
     EVT N1SrcSVT = N1Src.getValueType().getScalarType();
     if ((N0.isUndef() || N0SrcSVT == N1SrcSVT) &&
-        N0Src.getValueType().isVector() && N1Src.getValueType().isVector()) {
+        N0Src.getValueType().isFixedLengthVector() &&
+        N1Src.getValueType().isFixedLengthVector()) {
       EVT NewVT;
       SDLoc DL(N);
       SDValue NewIdx;
Index: clang/lib/Driver/ToolChains/Clang.cpp
===================================================================
--- clang/lib/Driver/ToolChains/Clang.cpp
+++ clang/lib/Driver/ToolChains/Clang.cpp
@@ -1721,9 +1721,17 @@
     StringRef Val = A->getValue();
     const Driver &D = getToolChain().getDriver();
     if (Val.equals("128") || Val.equals("256") || Val.equals("512") ||
-        Val.equals("1024") || Val.equals("2048"))
+        Val.equals("1024") || Val.equals("2048")) {
       CmdArgs.push_back(
           Args.MakeArgString(llvm::Twine("-msve-vector-bits=") + Val));
+
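+      // Also pass the value to the backend so fixed-length vectors can be
+      // lowered to SVE.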
+      CmdArgs.push_back("-mllvm");
+      CmdArgs.push_back(
+            Args.MakeArgString("-aarch64-sve-vector-bits-min=" + Val));
+      // CmdArgs.push_back("-mllvm");
+      // CmdArgs.push_back(
+      //    Args.MakeArgString("-aarch64-sve-vector-bits-max=" + Val));
+    }
     // Silently drop requests for vector-length agnostic code as it's implied.
     else if (!Val.equals("scalable"))
       // Handle the unsupported values passed to msve-vector-bits.