llvmbot wrote:

<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-clang

Author: YunQiang Su (wzssyqa)

<details>
<summary>Changes</summary>

Support auto-vectorization of fminimum_num and fmaximum_num.
For AArch64 with SVE, scalable vectors are not supported yet, while
for RISC-V Vector, scalable vectors already work well.
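
To make the change concrete, this is the loop shape the new tests exercise — a minimal sketch (array names illustrative, mirroring the f32min test in the diff below):

```c++
// Element-wise fminimum_num over two arrays. At -O3 the loop
// vectorizer can now widen the scalar builtin into the vector
// intrinsic: @llvm.minimumnum.v4f32 with fixed-width NEON, or
// @llvm.minimumnum.nxv4f32 with RISC-V scalable vectors.
float a[4096], b[4096], c[4096];

void f32min_sketch(void) {
  for (int i = 0; i < 4096; i++)
    c[i] = __builtin_fminimum_numf(a[i], b[i]);
}
```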

---

Patch is 33.38 KiB, truncated to 20.00 KiB below, full version: 
https://github.com/llvm/llvm-project/pull/131781.diff


4 Files Affected:

- (added) clang/test/CodeGen/fminimum-num-autovec.c (+407) 
- (modified) llvm/include/llvm/CodeGen/BasicTTIImpl.h (+6) 
- (modified) llvm/lib/Analysis/VectorUtils.cpp (+2) 
- (modified) llvm/lib/Target/RISCV/RISCVISelLowering.cpp (+11-2) 
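
The diff below is truncated before the non-test hunks, but the generic plumbing for making an intrinsic auto-vectorizable usually has the following shape — a hedged sketch of that pattern, not the verbatim changes:

```c++
// Sketch (assumed shape) of the two generic hooks involved.
//
// llvm/lib/Analysis/VectorUtils.cpp -- the loop vectorizer only widens
// calls to intrinsics reported as trivially vectorizable:
bool isTriviallyVectorizable(Intrinsic::ID ID) {
  switch (ID) {
  // ... existing cases ...
  case Intrinsic::minimumnum:
  case Intrinsic::maximumnum:
    return true;
  default:
    return false;
  }
}

// llvm/include/llvm/CodeGen/BasicTTIImpl.h -- getIntrinsicInstrCost
// maps each intrinsic onto an ISD opcode so the target cost model and
// legality queries apply to it (sketch):
//   case Intrinsic::minimumnum: ISD = ISD::FMINIMUMNUM; break;
//   case Intrinsic::maximumnum: ISD = ISD::FMAXIMUMNUM; break;
```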


``````````diff
diff --git a/clang/test/CodeGen/fminimum-num-autovec.c b/clang/test/CodeGen/fminimum-num-autovec.c
new file mode 100644
index 0000000000000..94114b6227d27
--- /dev/null
+++ b/clang/test/CodeGen/fminimum-num-autovec.c
@@ -0,0 +1,407 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// RUN: %clang --target=aarch64-unknown-linux-gnu -march=armv8+fp16 %s -O3 -emit-llvm -S -o - | FileCheck %s --check-prefix=ARMV8
+// RUN: %clang --target=riscv64-unknown-linux-gnu -march=rv64gv_zvfh %s -O3 -emit-llvm -S -o - | FileCheck %s --check-prefix=RV64_ZVFH
+// FIXME: SVE cannot emit VSCALE.
+
+
+float af32[4096];
+float bf32[4096];
+float cf32[4096];
+// ARMV8-LABEL: define dso_local void @f32min(
+// ARMV8-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+// ARMV8-NEXT:  [[ENTRY:.*]]:
+// ARMV8-NEXT:    br label %[[VECTOR_BODY:.*]]
+// ARMV8:       [[VECTOR_BODY]]:
+// ARMV8-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+// ARMV8-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @af32, i64 0, i64 [[INDEX]]
+// ARMV8-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 16
+// ARMV8-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP0]], align 4, !tbaa [[TBAA6:![0-9]+]]
+// ARMV8-NEXT:    [[WIDE_LOAD11:%.*]] = load <4 x float>, ptr [[TMP1]], align 4, !tbaa [[TBAA6]]
+// ARMV8-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @bf32, i64 0, i64 [[INDEX]]
+// ARMV8-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP2]], i64 16
+// ARMV8-NEXT:    [[WIDE_LOAD12:%.*]] = load <4 x float>, ptr [[TMP2]], align 4, !tbaa [[TBAA6]]
+// ARMV8-NEXT:    [[WIDE_LOAD13:%.*]] = load <4 x float>, ptr [[TMP3]], align 4, !tbaa [[TBAA6]]
+// ARMV8-NEXT:    [[TMP4:%.*]] = tail call <4 x float> @llvm.minimumnum.v4f32(<4 x float> [[WIDE_LOAD]], <4 x float> [[WIDE_LOAD12]])
+// ARMV8-NEXT:    [[TMP5:%.*]] = tail call <4 x float> @llvm.minimumnum.v4f32(<4 x float> [[WIDE_LOAD11]], <4 x float> [[WIDE_LOAD13]])
+// ARMV8-NEXT:    [[TMP6:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @cf32, i64 0, i64 [[INDEX]]
+// ARMV8-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP6]], i64 16
+// ARMV8-NEXT:    store <4 x float> [[TMP4]], ptr [[TMP6]], align 4, !tbaa [[TBAA6]]
+// ARMV8-NEXT:    store <4 x float> [[TMP5]], ptr [[TMP7]], align 4, !tbaa [[TBAA6]]
+// ARMV8-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+// ARMV8-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
+// ARMV8-NEXT:    br i1 [[TMP8]], label %[[FOR_COND_CLEANUP:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+// ARMV8:       [[FOR_COND_CLEANUP]]:
+// ARMV8-NEXT:    ret void
+//
+// RV64_ZVFH-LABEL: define dso_local void @f32min(
+// RV64_ZVFH-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+// RV64_ZVFH-NEXT:  [[ENTRY:.*]]:
+// RV64_ZVFH-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// RV64_ZVFH-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2
+// RV64_ZVFH-NEXT:    br label %[[VECTOR_BODY:.*]]
+// RV64_ZVFH:       [[VECTOR_BODY]]:
+// RV64_ZVFH-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+// RV64_ZVFH-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @af32, i64 0, i64 [[INDEX]]
+// RV64_ZVFH-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP2]], align 4, !tbaa [[TBAA9:![0-9]+]]
+// RV64_ZVFH-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @bf32, i64 0, i64 [[INDEX]]
+// RV64_ZVFH-NEXT:    [[WIDE_LOAD10:%.*]] = load <vscale x 4 x float>, ptr [[TMP3]], align 4, !tbaa [[TBAA9]]
+// RV64_ZVFH-NEXT:    [[TMP4:%.*]] = tail call <vscale x 4 x float> @llvm.minimumnum.nxv4f32(<vscale x 4 x float> [[WIDE_LOAD]], <vscale x 4 x float> [[WIDE_LOAD10]])
+// RV64_ZVFH-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @cf32, i64 0, i64 [[INDEX]]
+// RV64_ZVFH-NEXT:    store <vscale x 4 x float> [[TMP4]], ptr [[TMP5]], align 4, !tbaa [[TBAA9]]
+// RV64_ZVFH-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]]
+// RV64_ZVFH-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
+// RV64_ZVFH-NEXT:    br i1 [[TMP6]], label %[[FOR_COND_CLEANUP:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
+// RV64_ZVFH:       [[FOR_COND_CLEANUP]]:
+// RV64_ZVFH-NEXT:    ret void
+//
+void f32min() {
+       for (int i=0; i<4096; i++) {cf32[i] = __builtin_fminimum_numf(af32[i], bf32[i]);}
+}
+// ARMV8-LABEL: define dso_local void @f32max(
+// ARMV8-SAME: ) local_unnamed_addr #[[ATTR0]] {
+// ARMV8-NEXT:  [[ENTRY:.*]]:
+// ARMV8-NEXT:    br label %[[VECTOR_BODY:.*]]
+// ARMV8:       [[VECTOR_BODY]]:
+// ARMV8-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+// ARMV8-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @af32, i64 0, i64 [[INDEX]]
+// ARMV8-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 16
+// ARMV8-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP0]], align 4, !tbaa [[TBAA6]]
+// ARMV8-NEXT:    [[WIDE_LOAD11:%.*]] = load <4 x float>, ptr [[TMP1]], align 4, !tbaa [[TBAA6]]
+// ARMV8-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @bf32, i64 0, i64 [[INDEX]]
+// ARMV8-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP2]], i64 16
+// ARMV8-NEXT:    [[WIDE_LOAD12:%.*]] = load <4 x float>, ptr [[TMP2]], align 4, !tbaa [[TBAA6]]
+// ARMV8-NEXT:    [[WIDE_LOAD13:%.*]] = load <4 x float>, ptr [[TMP3]], align 4, !tbaa [[TBAA6]]
+// ARMV8-NEXT:    [[TMP4:%.*]] = tail call <4 x float> @llvm.maximumnum.v4f32(<4 x float> [[WIDE_LOAD]], <4 x float> [[WIDE_LOAD12]])
+// ARMV8-NEXT:    [[TMP5:%.*]] = tail call <4 x float> @llvm.maximumnum.v4f32(<4 x float> [[WIDE_LOAD11]], <4 x float> [[WIDE_LOAD13]])
+// ARMV8-NEXT:    [[TMP6:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @cf32, i64 0, i64 [[INDEX]]
+// ARMV8-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP6]], i64 16
+// ARMV8-NEXT:    store <4 x float> [[TMP4]], ptr [[TMP6]], align 4, !tbaa [[TBAA6]]
+// ARMV8-NEXT:    store <4 x float> [[TMP5]], ptr [[TMP7]], align 4, !tbaa [[TBAA6]]
+// ARMV8-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+// ARMV8-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
+// ARMV8-NEXT:    br i1 [[TMP8]], label %[[FOR_COND_CLEANUP:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+// ARMV8:       [[FOR_COND_CLEANUP]]:
+// ARMV8-NEXT:    ret void
+//
+// RV64_ZVFH-LABEL: define dso_local void @f32max(
+// RV64_ZVFH-SAME: ) local_unnamed_addr #[[ATTR0]] {
+// RV64_ZVFH-NEXT:  [[ENTRY:.*]]:
+// RV64_ZVFH-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// RV64_ZVFH-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2
+// RV64_ZVFH-NEXT:    br label %[[VECTOR_BODY:.*]]
+// RV64_ZVFH:       [[VECTOR_BODY]]:
+// RV64_ZVFH-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+// RV64_ZVFH-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @af32, i64 0, i64 [[INDEX]]
+// RV64_ZVFH-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP2]], align 4, !tbaa [[TBAA9]]
+// RV64_ZVFH-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @bf32, i64 0, i64 [[INDEX]]
+// RV64_ZVFH-NEXT:    [[WIDE_LOAD10:%.*]] = load <vscale x 4 x float>, ptr [[TMP3]], align 4, !tbaa [[TBAA9]]
+// RV64_ZVFH-NEXT:    [[TMP4:%.*]] = tail call <vscale x 4 x float> @llvm.maximumnum.nxv4f32(<vscale x 4 x float> [[WIDE_LOAD]], <vscale x 4 x float> [[WIDE_LOAD10]])
+// RV64_ZVFH-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @cf32, i64 0, i64 [[INDEX]]
+// RV64_ZVFH-NEXT:    store <vscale x 4 x float> [[TMP4]], ptr [[TMP5]], align 4, !tbaa [[TBAA9]]
+// RV64_ZVFH-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]]
+// RV64_ZVFH-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
+// RV64_ZVFH-NEXT:    br i1 [[TMP6]], label %[[FOR_COND_CLEANUP:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
+// RV64_ZVFH:       [[FOR_COND_CLEANUP]]:
+// RV64_ZVFH-NEXT:    ret void
+//
+void f32max() {
+       for (int i=0; i<4096; i++) {cf32[i] = __builtin_fmaximum_numf(af32[i], bf32[i]);}
+}
+
+double af64[4096];
+double bf64[4096];
+double cf64[4096];
+// ARMV8-LABEL: define dso_local void @f64min(
+// ARMV8-SAME: ) local_unnamed_addr #[[ATTR0]] {
+// ARMV8-NEXT:  [[ENTRY:.*]]:
+// ARMV8-NEXT:    br label %[[VECTOR_BODY:.*]]
+// ARMV8:       [[VECTOR_BODY]]:
+// ARMV8-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+// ARMV8-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @af64, i64 0, i64 [[INDEX]]
+// ARMV8-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 16
+// ARMV8-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP0]], align 8, !tbaa [[TBAA15:![0-9]+]]
+// ARMV8-NEXT:    [[WIDE_LOAD11:%.*]] = load <2 x double>, ptr [[TMP1]], align 8, !tbaa [[TBAA15]]
+// ARMV8-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @bf64, i64 0, i64 [[INDEX]]
+// ARMV8-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP2]], i64 16
+// ARMV8-NEXT:    [[WIDE_LOAD12:%.*]] = load <2 x double>, ptr [[TMP2]], align 8, !tbaa [[TBAA15]]
+// ARMV8-NEXT:    [[WIDE_LOAD13:%.*]] = load <2 x double>, ptr [[TMP3]], align 8, !tbaa [[TBAA15]]
+// ARMV8-NEXT:    [[TMP4:%.*]] = tail call <2 x double> @llvm.minimumnum.v2f64(<2 x double> [[WIDE_LOAD]], <2 x double> [[WIDE_LOAD12]])
+// ARMV8-NEXT:    [[TMP5:%.*]] = tail call <2 x double> @llvm.minimumnum.v2f64(<2 x double> [[WIDE_LOAD11]], <2 x double> [[WIDE_LOAD13]])
+// ARMV8-NEXT:    [[TMP6:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @cf64, i64 0, i64 [[INDEX]]
+// ARMV8-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP6]], i64 16
+// ARMV8-NEXT:    store <2 x double> [[TMP4]], ptr [[TMP6]], align 8, !tbaa [[TBAA15]]
+// ARMV8-NEXT:    store <2 x double> [[TMP5]], ptr [[TMP7]], align 8, !tbaa [[TBAA15]]
+// ARMV8-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+// ARMV8-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
+// ARMV8-NEXT:    br i1 [[TMP8]], label %[[FOR_COND_CLEANUP:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
+// ARMV8:       [[FOR_COND_CLEANUP]]:
+// ARMV8-NEXT:    ret void
+//
+// RV64_ZVFH-LABEL: define dso_local void @f64min(
+// RV64_ZVFH-SAME: ) local_unnamed_addr #[[ATTR0]] {
+// RV64_ZVFH-NEXT:  [[ENTRY:.*]]:
+// RV64_ZVFH-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// RV64_ZVFH-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 1
+// RV64_ZVFH-NEXT:    br label %[[VECTOR_BODY:.*]]
+// RV64_ZVFH:       [[VECTOR_BODY]]:
+// RV64_ZVFH-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+// RV64_ZVFH-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @af64, i64 0, i64 [[INDEX]]
+// RV64_ZVFH-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 2 x double>, ptr [[TMP2]], align 8, !tbaa [[TBAA18:![0-9]+]]
+// RV64_ZVFH-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @bf64, i64 0, i64 [[INDEX]]
+// RV64_ZVFH-NEXT:    [[WIDE_LOAD10:%.*]] = load <vscale x 2 x double>, ptr [[TMP3]], align 8, !tbaa [[TBAA18]]
+// RV64_ZVFH-NEXT:    [[TMP4:%.*]] = tail call <vscale x 2 x double> @llvm.minimumnum.nxv2f64(<vscale x 2 x double> [[WIDE_LOAD]], <vscale x 2 x double> [[WIDE_LOAD10]])
+// RV64_ZVFH-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @cf64, i64 0, i64 [[INDEX]]
+// RV64_ZVFH-NEXT:    store <vscale x 2 x double> [[TMP4]], ptr [[TMP5]], align 8, !tbaa [[TBAA18]]
+// RV64_ZVFH-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]]
+// RV64_ZVFH-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
+// RV64_ZVFH-NEXT:    br i1 [[TMP6]], label %[[FOR_COND_CLEANUP:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+// RV64_ZVFH:       [[FOR_COND_CLEANUP]]:
+// RV64_ZVFH-NEXT:    ret void
+//
+void f64min() {
+       for (int i=0; i<4096; i++) {cf64[i] = __builtin_fminimum_num(af64[i], bf64[i]);}
+}
+// ARMV8-LABEL: define dso_local void @f64max(
+// ARMV8-SAME: ) local_unnamed_addr #[[ATTR0]] {
+// ARMV8-NEXT:  [[ENTRY:.*]]:
+// ARMV8-NEXT:    br label %[[VECTOR_BODY:.*]]
+// ARMV8:       [[VECTOR_BODY]]:
+// ARMV8-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+// ARMV8-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @af64, i64 0, i64 [[INDEX]]
+// ARMV8-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 16
+// ARMV8-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP0]], align 8, !tbaa [[TBAA15]]
+// ARMV8-NEXT:    [[WIDE_LOAD11:%.*]] = load <2 x double>, ptr [[TMP1]], align 8, !tbaa [[TBAA15]]
+// ARMV8-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @bf64, i64 0, i64 [[INDEX]]
+// ARMV8-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP2]], i64 16
+// ARMV8-NEXT:    [[WIDE_LOAD12:%.*]] = load <2 x double>, ptr [[TMP2]], align 8, !tbaa [[TBAA15]]
+// ARMV8-NEXT:    [[WIDE_LOAD13:%.*]] = load <2 x double>, ptr [[TMP3]], align 8, !tbaa [[TBAA15]]
+// ARMV8-NEXT:    [[TMP4:%.*]] = tail call <2 x double> @llvm.maximumnum.v2f64(<2 x double> [[WIDE_LOAD]], <2 x double> [[WIDE_LOAD12]])
+// ARMV8-NEXT:    [[TMP5:%.*]] = tail call <2 x double> @llvm.maximumnum.v2f64(<2 x double> [[WIDE_LOAD11]], <2 x double> [[WIDE_LOAD13]])
+// ARMV8-NEXT:    [[TMP6:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @cf64, i64 0, i64 [[INDEX]]
+// ARMV8-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP6]], i64 16
+// ARMV8-NEXT:    store <2 x double> [[TMP4]], ptr [[TMP6]], align 8, !tbaa [[TBAA15]]
+// ARMV8-NEXT:    store <2 x double> [[TMP5]], ptr [[TMP7]], align 8, !tbaa [[TBAA15]]
+// ARMV8-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+// ARMV8-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
+// ARMV8-NEXT:    br i1 [[TMP8]], label %[[FOR_COND_CLEANUP:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
+// ARMV8:       [[FOR_COND_CLEANUP]]:
+// ARMV8-NEXT:    ret void
+//
+// RV64_ZVFH-LABEL: define dso_local void @f64max(
+// RV64_ZVFH-SAME: ) local_unnamed_addr #[[ATTR0]] {
+// RV64_ZVFH-NEXT:  [[ENTRY:.*]]:
+// RV64_ZVFH-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// RV64_ZVFH-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 1
+// RV64_ZVFH-NEXT:    br label %[[VECTOR_BODY:.*]]
+// RV64_ZVFH:       [[VECTOR_BODY]]:
+// RV64_ZVFH-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+// RV64_ZVFH-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @af64, i64 0, i64 [[INDEX]]
+// RV64_ZVFH-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 2 x double>, ptr [[TMP2]], align 8, !tbaa [[TBAA18]]
+// RV64_ZVFH-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @bf64, i64 0, i64 [[INDEX]]
+// RV64_ZVFH-NEXT:    [[WIDE_LOAD10:%.*]] = load <vscale x 2 x double>, ptr [[TMP3]], align 8, !tbaa [[TBAA18]]
+// RV64_ZVFH-NEXT:    [[TMP4:%.*]] = tail call <vscale x 2 x double> @llvm.maximumnum.nxv2f64(<vscale x 2 x double> [[WIDE_LOAD]], <vscale x 2 x double> [[WIDE_LOAD10]])
+// RV64_ZVFH-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @cf64, i64 0, i64 [[INDEX]]
+// RV64_ZVFH-NEXT:    store <vscale x 2 x double> [[TMP4]], ptr [[TMP5]], align 8, !tbaa [[TBAA18]]
+// RV64_ZVFH-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]]
+// RV64_ZVFH-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
+// RV64_ZVFH-NEXT:    br i1 [[TMP6]], label %[[FOR_COND_CLEANUP:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
+// RV64_ZVFH:       [[FOR_COND_CLEANUP]]:
+// RV64_ZVFH-NEXT:    ret void
+//
+void f64max() {
+       for (int i=0; i<4096; i++) {cf64[i] = __builtin_fmaximum_num(af64[i], bf64[i]);}
+}
+
+__fp16 af16[4096];
+__fp16 bf16[4096];
+__fp16 cf16[4096];
+// ARMV8-LABEL: define dso_local void @f16min(
+// ARMV8-SAME: ) local_unnamed_addr #[[ATTR0]] {
+// ARMV8-NEXT:  [[ENTRY:.*]]:
+// ARMV8-NEXT:    br label %[[VECTOR_BODY:.*]]
+// ARMV8:       [[VECTOR_BODY]]:
+// ARMV8-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+// ARMV8-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @af16, i64 0, i64 [[INDEX]]
+// ARMV8-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 16
+// ARMV8-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x half>, ptr [[TMP0]], align 2, !tbaa [[TBAA19:![0-9]+]]
+// ARMV8-NEXT:    [[WIDE_LOAD11:%.*]] = load <8 x half>, ptr [[TMP1]], align 2, !tbaa [[TBAA19]]
+// ARMV8-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @bf16, i64 0, i64 [[INDEX]]
+// ARMV8-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP2]], i64 16
+// ARMV8-NEXT:    [[WIDE_LOAD12:%.*]] = load <8 x half>, ptr [[TMP2]], align 2, !tbaa [[TBAA19]]
+// ARMV8-NEXT:    [[WIDE_LOAD13:%.*]] = load <8 x half>, ptr [[TMP3]], align 2, !tbaa [[TBAA19]]
+// ARMV8-NEXT:    [[TMP4:%.*]] = tail call <8 x half> @llvm.minimumnum.v8f16(<8 x half> [[WIDE_LOAD]], <8 x half> [[WIDE_LOAD12]])
+// ARMV8-NEXT:    [[TMP5:%.*]] = tail call <8 x half> @llvm.minimumnum.v8f16(<8 x half> [[WIDE_LOAD11]], <8 x half> [[WIDE_LOAD13]])
+// ARMV8-NEXT:    [[TMP6:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @cf16, i64 0, i64 [[INDEX]]
+// ARMV8-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP6]], i64 16
+// ARMV8-NEXT:    store <8 x half> [[TMP4]], ptr [[TMP6]], align 2, !tbaa [[TBAA19]]
+// ARMV8-NEXT:    store <8 x half> [[TMP5]], ptr [[TMP7]], align 2, !tbaa [[TBAA19]]
+// ARMV8-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+// ARMV8-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
+// ARMV8-NEXT:    br i1 [[TMP8]], label %[[FOR_COND_CLEANUP:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
+// ARMV8:       [[FOR_COND_CLEANUP]]:
+// ARMV8-NEXT:    ret void
+//
+// RV64_ZVFH-LABEL: define dso_local void @f16min(
+// RV64_ZVFH-SAME: ) local_unnamed_addr #[[ATTR0]] {
+// RV64_ZVFH-NEXT:  [[ENTRY:.*]]:
+// RV64_ZVFH-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// RV64_ZVFH-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp samesign ugt i64 [[TMP0]], 512
+// RV64_ZVFH-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[FOR_BODY_PREHEADER:.*]], label %[[VECTOR_PH:.*]]
+// RV64_ZVFH:       [[VECTOR_PH]]:
+// RV64_ZVFH-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// RV64_ZVFH-NEXT:    [[DOTNEG:%.*]] = mul nuw nsw i64 [[TMP1]], 8184
+// RV64_ZVFH-NEXT:    [[N_VEC:%.*]] = and i64 [[DOTNEG]], 4096
+// RV64_ZVFH-NEXT:    [[TMP2:%.*]] = tail call i64 @llvm.vscale.i64()
+// RV64_ZVFH-NEXT:    [[TMP3:%.*]] = shl nuw nsw i64 [[TMP2]], 3
+// RV64_ZVFH-NEXT:    br label %[[VECTOR_BODY:.*]]
+// RV64_ZVFH:       [[VECTOR_BODY]]:
+// RV64_ZVFH-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+// RV64_ZVFH-NEXT:    [[TMP4:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @af16, i64 0, i64 [[INDEX]]
+// RV64_ZVFH-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 8 x half>, ptr [[TMP4]], align 2, !tbaa [[TBAA22:![0-9]+]]
+// RV64_ZVFH-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @bf16, i64 0, i64 [[INDEX]]
+// RV64_ZVFH-NEXT:    [[WIDE_LOAD10:%.*]] = load <vscale x 8 x half>, ptr [[TMP5]], align 2, !tbaa [[TBAA22]]
+// RV64_ZVFH-NEXT:    [[TMP6:%.*]] = tail call <vscale x 8 x half> @llvm.minimumnum.nxv8f16(<vscale x 8 x half> [[WIDE_LOAD]], <vscale x 8 x half> [[WIDE_LOAD10]])
+// RV64_ZVFH-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @cf16, i64 0, i64 [[INDEX]]
+// RV64_ZVFH-NEXT:    store <vscale x 8 x half> [[TMP6]], ptr [[TMP7]], align 2, !tbaa [[TBAA22]]
+// RV64_ZVFH-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
+// RV64_ZVFH-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+// RV64_ZVFH-NEXT:    br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
+// RV64_ZVFH:       [[MIDDLE_BLOCK]]:
+// RV64_ZVFH-NEXT:    [[CMP_N_NOT:%.*]] = icmp eq i64 [[N_VEC]], 0
+// RV64_ZVFH-NEXT:    br i1 [[CMP_N_NOT]], label %[[FOR_BODY_PREHEADER]], label %[[F...
[truncated]

``````````
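
The RISCVISelLowering.cpp hunk is cut off by the truncation above; presumably it is what lets the scalable `<vscale x N x ...>` forms of `llvm.minimumnum`/`llvm.maximumnum` select cleanly, roughly in this direction (an assumed sketch of target setup code inside the lowering constructor, not the verbatim change):

```c++
// Assumed sketch: declare the new ISD nodes handled for each supported
// vector FP type so the legalizer keeps the scalable intrinsics intact.
for (MVT VT : {MVT::nxv4f32, MVT::nxv2f64 /* , ... */}) {
  setOperationAction(ISD::FMINIMUMNUM, VT, Legal);
  setOperationAction(ISD::FMAXIMUMNUM, VT, Legal);
}
```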

</details>


https://github.com/llvm/llvm-project/pull/131781