https://github.com/wzssyqa created https://github.com/llvm/llvm-project/pull/131781

Support auto-vectorization of fminimum_num and fmaximum_num.
On AArch64 with SVE, scalable vectors are not supported yet;
on RISC-V with the Vector extension, scalable vectors work as expected.
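
For reference, here is a minimal sketch of the loop pattern this change enables (it mirrors the new test below; the array/function names and the vector widths mentioned in the comments are illustrative only):

float a[4096], b[4096], c[4096];

void fmin_loop(void) {
  /* IEEE 754-2019 minimumNumber of each element pair. */
  for (int i = 0; i < 4096; i++)
    c[i] = __builtin_fminimum_numf(a[i], b[i]);
}

With -O3 the loop vectorizer should now emit calls such as @llvm.minimumnum.v4f32 (fixed-width vectors, e.g. AArch64 NEON) or @llvm.minimumnum.nxv4f32 (scalable vectors, RISC-V V); the added clang/test/CodeGen/fminimum-num-autovec.c checks exactly this.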

From e367aaa410dcbb6f3d1c5803eac49dde6dae25c5 Mon Sep 17 00:00:00 2001
From: YunQiang Su <yunqi...@isrc.iscas.ac.cn>
Date: Tue, 18 Mar 2025 18:46:29 +0800
Subject: [PATCH] Vectorize: Support fminimumnum and fmaximumnum

Support auto-vectorization of fminimum_num and fmaximum_num.
On AArch64 with SVE, scalable vectors are not supported yet;
on RISC-V with the Vector extension, scalable vectors work as expected.
---
 clang/test/CodeGen/fminimum-num-autovec.c   | 407 ++++++++++++++++++++
 llvm/include/llvm/CodeGen/BasicTTIImpl.h    |   6 +
 llvm/lib/Analysis/VectorUtils.cpp           |   2 +
 llvm/lib/Target/RISCV/RISCVISelLowering.cpp |  13 +-
 4 files changed, 426 insertions(+), 2 deletions(-)
 create mode 100644 clang/test/CodeGen/fminimum-num-autovec.c

diff --git a/clang/test/CodeGen/fminimum-num-autovec.c b/clang/test/CodeGen/fminimum-num-autovec.c
new file mode 100644
index 0000000000000..94114b6227d27
--- /dev/null
+++ b/clang/test/CodeGen/fminimum-num-autovec.c
@@ -0,0 +1,407 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// RUN: %clang --target=aarch64-unknown-linux-gnu -march=armv8+fp16 %s -O3 -emit-llvm -S -o - | FileCheck %s --check-prefix=ARMV8
+// RUN: %clang --target=riscv64-unknown-linux-gnu -march=rv64gv_zvfh %s -O3 -emit-llvm -S -o - | FileCheck %s --check-prefix=RV64_ZVFH
+// FIXME: SVE cannot emit VSCALE.
+
+
+float af32[4096];
+float bf32[4096];
+float cf32[4096];
+// ARMV8-LABEL: define dso_local void @f32min(
+// ARMV8-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+// ARMV8-NEXT:  [[ENTRY:.*]]:
+// ARMV8-NEXT:    br label %[[VECTOR_BODY:.*]]
+// ARMV8:       [[VECTOR_BODY]]:
+// ARMV8-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ 
[[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+// ARMV8-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw [4096 x float], 
ptr @af32, i64 0, i64 [[INDEX]]
+// ARMV8-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], 
i64 16
+// ARMV8-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP0]], align 4, 
!tbaa [[TBAA6:![0-9]+]]
+// ARMV8-NEXT:    [[WIDE_LOAD11:%.*]] = load <4 x float>, ptr [[TMP1]], align 
4, !tbaa [[TBAA6]]
+// ARMV8-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw [4096 x float], 
ptr @bf32, i64 0, i64 [[INDEX]]
+// ARMV8-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP2]], 
i64 16
+// ARMV8-NEXT:    [[WIDE_LOAD12:%.*]] = load <4 x float>, ptr [[TMP2]], align 
4, !tbaa [[TBAA6]]
+// ARMV8-NEXT:    [[WIDE_LOAD13:%.*]] = load <4 x float>, ptr [[TMP3]], align 
4, !tbaa [[TBAA6]]
+// ARMV8-NEXT:    [[TMP4:%.*]] = tail call <4 x float> 
@llvm.minimumnum.v4f32(<4 x float> [[WIDE_LOAD]], <4 x float> [[WIDE_LOAD12]])
+// ARMV8-NEXT:    [[TMP5:%.*]] = tail call <4 x float> 
@llvm.minimumnum.v4f32(<4 x float> [[WIDE_LOAD11]], <4 x float> [[WIDE_LOAD13]])
+// ARMV8-NEXT:    [[TMP6:%.*]] = getelementptr inbounds nuw [4096 x float], 
ptr @cf32, i64 0, i64 [[INDEX]]
+// ARMV8-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP6]], 
i64 16
+// ARMV8-NEXT:    store <4 x float> [[TMP4]], ptr [[TMP6]], align 4, !tbaa 
[[TBAA6]]
+// ARMV8-NEXT:    store <4 x float> [[TMP5]], ptr [[TMP7]], align 4, !tbaa 
[[TBAA6]]
+// ARMV8-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+// ARMV8-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
+// ARMV8-NEXT:    br i1 [[TMP8]], label %[[FOR_COND_CLEANUP:.*]], label 
%[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+// ARMV8:       [[FOR_COND_CLEANUP]]:
+// ARMV8-NEXT:    ret void
+//
+// RV64_ZVFH-LABEL: define dso_local void @f32min(
+// RV64_ZVFH-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+// RV64_ZVFH-NEXT:  [[ENTRY:.*]]:
+// RV64_ZVFH-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// RV64_ZVFH-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2
+// RV64_ZVFH-NEXT:    br label %[[VECTOR_BODY:.*]]
+// RV64_ZVFH:       [[VECTOR_BODY]]:
+// RV64_ZVFH-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ 
[[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+// RV64_ZVFH-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw [4096 x 
float], ptr @af32, i64 0, i64 [[INDEX]]
+// RV64_ZVFH-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr 
[[TMP2]], align 4, !tbaa [[TBAA9:![0-9]+]]
+// RV64_ZVFH-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw [4096 x 
float], ptr @bf32, i64 0, i64 [[INDEX]]
+// RV64_ZVFH-NEXT:    [[WIDE_LOAD10:%.*]] = load <vscale x 4 x float>, ptr 
[[TMP3]], align 4, !tbaa [[TBAA9]]
+// RV64_ZVFH-NEXT:    [[TMP4:%.*]] = tail call <vscale x 4 x float> 
@llvm.minimumnum.nxv4f32(<vscale x 4 x float> [[WIDE_LOAD]], <vscale x 4 x 
float> [[WIDE_LOAD10]])
+// RV64_ZVFH-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw [4096 x 
float], ptr @cf32, i64 0, i64 [[INDEX]]
+// RV64_ZVFH-NEXT:    store <vscale x 4 x float> [[TMP4]], ptr [[TMP5]], align 
4, !tbaa [[TBAA9]]
+// RV64_ZVFH-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]]
+// RV64_ZVFH-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
+// RV64_ZVFH-NEXT:    br i1 [[TMP6]], label %[[FOR_COND_CLEANUP:.*]], label 
%[[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
+// RV64_ZVFH:       [[FOR_COND_CLEANUP]]:
+// RV64_ZVFH-NEXT:    ret void
+//
+void f32min() {
+       for (int i=0; i<4096; i++) {cf32[i] = __builtin_fminimum_numf(af32[i], bf32[i]);}
+}
+// ARMV8-LABEL: define dso_local void @f32max(
+// ARMV8-SAME: ) local_unnamed_addr #[[ATTR0]] {
+// ARMV8-NEXT:  [[ENTRY:.*]]:
+// ARMV8-NEXT:    br label %[[VECTOR_BODY:.*]]
+// ARMV8:       [[VECTOR_BODY]]:
+// ARMV8-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ 
[[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+// ARMV8-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw [4096 x float], 
ptr @af32, i64 0, i64 [[INDEX]]
+// ARMV8-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], 
i64 16
+// ARMV8-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP0]], align 4, 
!tbaa [[TBAA6]]
+// ARMV8-NEXT:    [[WIDE_LOAD11:%.*]] = load <4 x float>, ptr [[TMP1]], align 
4, !tbaa [[TBAA6]]
+// ARMV8-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw [4096 x float], 
ptr @bf32, i64 0, i64 [[INDEX]]
+// ARMV8-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP2]], 
i64 16
+// ARMV8-NEXT:    [[WIDE_LOAD12:%.*]] = load <4 x float>, ptr [[TMP2]], align 
4, !tbaa [[TBAA6]]
+// ARMV8-NEXT:    [[WIDE_LOAD13:%.*]] = load <4 x float>, ptr [[TMP3]], align 
4, !tbaa [[TBAA6]]
+// ARMV8-NEXT:    [[TMP4:%.*]] = tail call <4 x float> 
@llvm.maximumnum.v4f32(<4 x float> [[WIDE_LOAD]], <4 x float> [[WIDE_LOAD12]])
+// ARMV8-NEXT:    [[TMP5:%.*]] = tail call <4 x float> 
@llvm.maximumnum.v4f32(<4 x float> [[WIDE_LOAD11]], <4 x float> [[WIDE_LOAD13]])
+// ARMV8-NEXT:    [[TMP6:%.*]] = getelementptr inbounds nuw [4096 x float], 
ptr @cf32, i64 0, i64 [[INDEX]]
+// ARMV8-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP6]], 
i64 16
+// ARMV8-NEXT:    store <4 x float> [[TMP4]], ptr [[TMP6]], align 4, !tbaa 
[[TBAA6]]
+// ARMV8-NEXT:    store <4 x float> [[TMP5]], ptr [[TMP7]], align 4, !tbaa 
[[TBAA6]]
+// ARMV8-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+// ARMV8-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
+// ARMV8-NEXT:    br i1 [[TMP8]], label %[[FOR_COND_CLEANUP:.*]], label 
%[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+// ARMV8:       [[FOR_COND_CLEANUP]]:
+// ARMV8-NEXT:    ret void
+//
+// RV64_ZVFH-LABEL: define dso_local void @f32max(
+// RV64_ZVFH-SAME: ) local_unnamed_addr #[[ATTR0]] {
+// RV64_ZVFH-NEXT:  [[ENTRY:.*]]:
+// RV64_ZVFH-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// RV64_ZVFH-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2
+// RV64_ZVFH-NEXT:    br label %[[VECTOR_BODY:.*]]
+// RV64_ZVFH:       [[VECTOR_BODY]]:
+// RV64_ZVFH-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ 
[[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+// RV64_ZVFH-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw [4096 x 
float], ptr @af32, i64 0, i64 [[INDEX]]
+// RV64_ZVFH-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr 
[[TMP2]], align 4, !tbaa [[TBAA9]]
+// RV64_ZVFH-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw [4096 x 
float], ptr @bf32, i64 0, i64 [[INDEX]]
+// RV64_ZVFH-NEXT:    [[WIDE_LOAD10:%.*]] = load <vscale x 4 x float>, ptr 
[[TMP3]], align 4, !tbaa [[TBAA9]]
+// RV64_ZVFH-NEXT:    [[TMP4:%.*]] = tail call <vscale x 4 x float> 
@llvm.maximumnum.nxv4f32(<vscale x 4 x float> [[WIDE_LOAD]], <vscale x 4 x 
float> [[WIDE_LOAD10]])
+// RV64_ZVFH-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw [4096 x 
float], ptr @cf32, i64 0, i64 [[INDEX]]
+// RV64_ZVFH-NEXT:    store <vscale x 4 x float> [[TMP4]], ptr [[TMP5]], align 
4, !tbaa [[TBAA9]]
+// RV64_ZVFH-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]]
+// RV64_ZVFH-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
+// RV64_ZVFH-NEXT:    br i1 [[TMP6]], label %[[FOR_COND_CLEANUP:.*]], label 
%[[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
+// RV64_ZVFH:       [[FOR_COND_CLEANUP]]:
+// RV64_ZVFH-NEXT:    ret void
+//
+void f32max() {
+       for (int i=0; i<4096; i++) {cf32[i] = __builtin_fmaximum_numf(af32[i], bf32[i]);}
+}
+
+double af64[4096];
+double bf64[4096];
+double cf64[4096];
+// ARMV8-LABEL: define dso_local void @f64min(
+// ARMV8-SAME: ) local_unnamed_addr #[[ATTR0]] {
+// ARMV8-NEXT:  [[ENTRY:.*]]:
+// ARMV8-NEXT:    br label %[[VECTOR_BODY:.*]]
+// ARMV8:       [[VECTOR_BODY]]:
+// ARMV8-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ 
[[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+// ARMV8-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw [4096 x double], 
ptr @af64, i64 0, i64 [[INDEX]]
+// ARMV8-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], 
i64 16
+// ARMV8-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP0]], align 
8, !tbaa [[TBAA15:![0-9]+]]
+// ARMV8-NEXT:    [[WIDE_LOAD11:%.*]] = load <2 x double>, ptr [[TMP1]], align 
8, !tbaa [[TBAA15]]
+// ARMV8-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw [4096 x double], 
ptr @bf64, i64 0, i64 [[INDEX]]
+// ARMV8-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP2]], 
i64 16
+// ARMV8-NEXT:    [[WIDE_LOAD12:%.*]] = load <2 x double>, ptr [[TMP2]], align 
8, !tbaa [[TBAA15]]
+// ARMV8-NEXT:    [[WIDE_LOAD13:%.*]] = load <2 x double>, ptr [[TMP3]], align 
8, !tbaa [[TBAA15]]
+// ARMV8-NEXT:    [[TMP4:%.*]] = tail call <2 x double> 
@llvm.minimumnum.v2f64(<2 x double> [[WIDE_LOAD]], <2 x double> [[WIDE_LOAD12]])
+// ARMV8-NEXT:    [[TMP5:%.*]] = tail call <2 x double> 
@llvm.minimumnum.v2f64(<2 x double> [[WIDE_LOAD11]], <2 x double> 
[[WIDE_LOAD13]])
+// ARMV8-NEXT:    [[TMP6:%.*]] = getelementptr inbounds nuw [4096 x double], 
ptr @cf64, i64 0, i64 [[INDEX]]
+// ARMV8-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP6]], 
i64 16
+// ARMV8-NEXT:    store <2 x double> [[TMP4]], ptr [[TMP6]], align 8, !tbaa 
[[TBAA15]]
+// ARMV8-NEXT:    store <2 x double> [[TMP5]], ptr [[TMP7]], align 8, !tbaa 
[[TBAA15]]
+// ARMV8-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+// ARMV8-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
+// ARMV8-NEXT:    br i1 [[TMP8]], label %[[FOR_COND_CLEANUP:.*]], label 
%[[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
+// ARMV8:       [[FOR_COND_CLEANUP]]:
+// ARMV8-NEXT:    ret void
+//
+// RV64_ZVFH-LABEL: define dso_local void @f64min(
+// RV64_ZVFH-SAME: ) local_unnamed_addr #[[ATTR0]] {
+// RV64_ZVFH-NEXT:  [[ENTRY:.*]]:
+// RV64_ZVFH-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// RV64_ZVFH-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 1
+// RV64_ZVFH-NEXT:    br label %[[VECTOR_BODY:.*]]
+// RV64_ZVFH:       [[VECTOR_BODY]]:
+// RV64_ZVFH-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ 
[[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+// RV64_ZVFH-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw [4096 x 
double], ptr @af64, i64 0, i64 [[INDEX]]
+// RV64_ZVFH-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 2 x double>, ptr 
[[TMP2]], align 8, !tbaa [[TBAA18:![0-9]+]]
+// RV64_ZVFH-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw [4096 x 
double], ptr @bf64, i64 0, i64 [[INDEX]]
+// RV64_ZVFH-NEXT:    [[WIDE_LOAD10:%.*]] = load <vscale x 2 x double>, ptr 
[[TMP3]], align 8, !tbaa [[TBAA18]]
+// RV64_ZVFH-NEXT:    [[TMP4:%.*]] = tail call <vscale x 2 x double> 
@llvm.minimumnum.nxv2f64(<vscale x 2 x double> [[WIDE_LOAD]], <vscale x 2 x 
double> [[WIDE_LOAD10]])
+// RV64_ZVFH-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw [4096 x 
double], ptr @cf64, i64 0, i64 [[INDEX]]
+// RV64_ZVFH-NEXT:    store <vscale x 2 x double> [[TMP4]], ptr [[TMP5]], 
align 8, !tbaa [[TBAA18]]
+// RV64_ZVFH-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]]
+// RV64_ZVFH-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
+// RV64_ZVFH-NEXT:    br i1 [[TMP6]], label %[[FOR_COND_CLEANUP:.*]], label 
%[[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+// RV64_ZVFH:       [[FOR_COND_CLEANUP]]:
+// RV64_ZVFH-NEXT:    ret void
+//
+void f64min() {
+       for (int i=0; i<4096; i++) {cf64[i] = __builtin_fminimum_num(af64[i], bf64[i]);}
+}
+// ARMV8-LABEL: define dso_local void @f64max(
+// ARMV8-SAME: ) local_unnamed_addr #[[ATTR0]] {
+// ARMV8-NEXT:  [[ENTRY:.*]]:
+// ARMV8-NEXT:    br label %[[VECTOR_BODY:.*]]
+// ARMV8:       [[VECTOR_BODY]]:
+// ARMV8-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ 
[[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+// ARMV8-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw [4096 x double], 
ptr @af64, i64 0, i64 [[INDEX]]
+// ARMV8-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], 
i64 16
+// ARMV8-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP0]], align 
8, !tbaa [[TBAA15]]
+// ARMV8-NEXT:    [[WIDE_LOAD11:%.*]] = load <2 x double>, ptr [[TMP1]], align 
8, !tbaa [[TBAA15]]
+// ARMV8-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw [4096 x double], 
ptr @bf64, i64 0, i64 [[INDEX]]
+// ARMV8-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP2]], 
i64 16
+// ARMV8-NEXT:    [[WIDE_LOAD12:%.*]] = load <2 x double>, ptr [[TMP2]], align 
8, !tbaa [[TBAA15]]
+// ARMV8-NEXT:    [[WIDE_LOAD13:%.*]] = load <2 x double>, ptr [[TMP3]], align 
8, !tbaa [[TBAA15]]
+// ARMV8-NEXT:    [[TMP4:%.*]] = tail call <2 x double> 
@llvm.maximumnum.v2f64(<2 x double> [[WIDE_LOAD]], <2 x double> [[WIDE_LOAD12]])
+// ARMV8-NEXT:    [[TMP5:%.*]] = tail call <2 x double> 
@llvm.maximumnum.v2f64(<2 x double> [[WIDE_LOAD11]], <2 x double> 
[[WIDE_LOAD13]])
+// ARMV8-NEXT:    [[TMP6:%.*]] = getelementptr inbounds nuw [4096 x double], 
ptr @cf64, i64 0, i64 [[INDEX]]
+// ARMV8-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP6]], 
i64 16
+// ARMV8-NEXT:    store <2 x double> [[TMP4]], ptr [[TMP6]], align 8, !tbaa 
[[TBAA15]]
+// ARMV8-NEXT:    store <2 x double> [[TMP5]], ptr [[TMP7]], align 8, !tbaa 
[[TBAA15]]
+// ARMV8-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+// ARMV8-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
+// ARMV8-NEXT:    br i1 [[TMP8]], label %[[FOR_COND_CLEANUP:.*]], label 
%[[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
+// ARMV8:       [[FOR_COND_CLEANUP]]:
+// ARMV8-NEXT:    ret void
+//
+// RV64_ZVFH-LABEL: define dso_local void @f64max(
+// RV64_ZVFH-SAME: ) local_unnamed_addr #[[ATTR0]] {
+// RV64_ZVFH-NEXT:  [[ENTRY:.*]]:
+// RV64_ZVFH-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// RV64_ZVFH-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 1
+// RV64_ZVFH-NEXT:    br label %[[VECTOR_BODY:.*]]
+// RV64_ZVFH:       [[VECTOR_BODY]]:
+// RV64_ZVFH-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ 
[[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+// RV64_ZVFH-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw [4096 x 
double], ptr @af64, i64 0, i64 [[INDEX]]
+// RV64_ZVFH-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 2 x double>, ptr 
[[TMP2]], align 8, !tbaa [[TBAA18]]
+// RV64_ZVFH-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw [4096 x 
double], ptr @bf64, i64 0, i64 [[INDEX]]
+// RV64_ZVFH-NEXT:    [[WIDE_LOAD10:%.*]] = load <vscale x 2 x double>, ptr 
[[TMP3]], align 8, !tbaa [[TBAA18]]
+// RV64_ZVFH-NEXT:    [[TMP4:%.*]] = tail call <vscale x 2 x double> 
@llvm.maximumnum.nxv2f64(<vscale x 2 x double> [[WIDE_LOAD]], <vscale x 2 x 
double> [[WIDE_LOAD10]])
+// RV64_ZVFH-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw [4096 x 
double], ptr @cf64, i64 0, i64 [[INDEX]]
+// RV64_ZVFH-NEXT:    store <vscale x 2 x double> [[TMP4]], ptr [[TMP5]], 
align 8, !tbaa [[TBAA18]]
+// RV64_ZVFH-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]]
+// RV64_ZVFH-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
+// RV64_ZVFH-NEXT:    br i1 [[TMP6]], label %[[FOR_COND_CLEANUP:.*]], label 
%[[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
+// RV64_ZVFH:       [[FOR_COND_CLEANUP]]:
+// RV64_ZVFH-NEXT:    ret void
+//
+void f64max() {
+       for (int i=0; i<4096; i++) {cf64[i] = __builtin_fmaximum_num(af64[i], bf64[i]);}
+}
+
+__fp16 af16[4096];
+__fp16 bf16[4096];
+__fp16 cf16[4096];
+// ARMV8-LABEL: define dso_local void @f16min(
+// ARMV8-SAME: ) local_unnamed_addr #[[ATTR0]] {
+// ARMV8-NEXT:  [[ENTRY:.*]]:
+// ARMV8-NEXT:    br label %[[VECTOR_BODY:.*]]
+// ARMV8:       [[VECTOR_BODY]]:
+// ARMV8-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ 
[[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+// ARMV8-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw [4096 x half], ptr 
@af16, i64 0, i64 [[INDEX]]
+// ARMV8-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], 
i64 16
+// ARMV8-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x half>, ptr [[TMP0]], align 2, 
!tbaa [[TBAA19:![0-9]+]]
+// ARMV8-NEXT:    [[WIDE_LOAD11:%.*]] = load <8 x half>, ptr [[TMP1]], align 
2, !tbaa [[TBAA19]]
+// ARMV8-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw [4096 x half], ptr 
@bf16, i64 0, i64 [[INDEX]]
+// ARMV8-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP2]], 
i64 16
+// ARMV8-NEXT:    [[WIDE_LOAD12:%.*]] = load <8 x half>, ptr [[TMP2]], align 
2, !tbaa [[TBAA19]]
+// ARMV8-NEXT:    [[WIDE_LOAD13:%.*]] = load <8 x half>, ptr [[TMP3]], align 
2, !tbaa [[TBAA19]]
+// ARMV8-NEXT:    [[TMP4:%.*]] = tail call <8 x half> 
@llvm.minimumnum.v8f16(<8 x half> [[WIDE_LOAD]], <8 x half> [[WIDE_LOAD12]])
+// ARMV8-NEXT:    [[TMP5:%.*]] = tail call <8 x half> 
@llvm.minimumnum.v8f16(<8 x half> [[WIDE_LOAD11]], <8 x half> [[WIDE_LOAD13]])
+// ARMV8-NEXT:    [[TMP6:%.*]] = getelementptr inbounds nuw [4096 x half], ptr 
@cf16, i64 0, i64 [[INDEX]]
+// ARMV8-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP6]], 
i64 16
+// ARMV8-NEXT:    store <8 x half> [[TMP4]], ptr [[TMP6]], align 2, !tbaa 
[[TBAA19]]
+// ARMV8-NEXT:    store <8 x half> [[TMP5]], ptr [[TMP7]], align 2, !tbaa 
[[TBAA19]]
+// ARMV8-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+// ARMV8-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
+// ARMV8-NEXT:    br i1 [[TMP8]], label %[[FOR_COND_CLEANUP:.*]], label 
%[[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
+// ARMV8:       [[FOR_COND_CLEANUP]]:
+// ARMV8-NEXT:    ret void
+//
+// RV64_ZVFH-LABEL: define dso_local void @f16min(
+// RV64_ZVFH-SAME: ) local_unnamed_addr #[[ATTR0]] {
+// RV64_ZVFH-NEXT:  [[ENTRY:.*]]:
+// RV64_ZVFH-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// RV64_ZVFH-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp samesign ugt i64 
[[TMP0]], 512
+// RV64_ZVFH-NEXT:    br i1 [[MIN_ITERS_CHECK]], label 
%[[FOR_BODY_PREHEADER:.*]], label %[[VECTOR_PH:.*]]
+// RV64_ZVFH:       [[VECTOR_PH]]:
+// RV64_ZVFH-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// RV64_ZVFH-NEXT:    [[DOTNEG:%.*]] = mul nuw nsw i64 [[TMP1]], 8184
+// RV64_ZVFH-NEXT:    [[N_VEC:%.*]] = and i64 [[DOTNEG]], 4096
+// RV64_ZVFH-NEXT:    [[TMP2:%.*]] = tail call i64 @llvm.vscale.i64()
+// RV64_ZVFH-NEXT:    [[TMP3:%.*]] = shl nuw nsw i64 [[TMP2]], 3
+// RV64_ZVFH-NEXT:    br label %[[VECTOR_BODY:.*]]
+// RV64_ZVFH:       [[VECTOR_BODY]]:
+// RV64_ZVFH-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ 
[[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+// RV64_ZVFH-NEXT:    [[TMP4:%.*]] = getelementptr inbounds nuw [4096 x half], 
ptr @af16, i64 0, i64 [[INDEX]]
+// RV64_ZVFH-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 8 x half>, ptr 
[[TMP4]], align 2, !tbaa [[TBAA22:![0-9]+]]
+// RV64_ZVFH-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw [4096 x half], 
ptr @bf16, i64 0, i64 [[INDEX]]
+// RV64_ZVFH-NEXT:    [[WIDE_LOAD10:%.*]] = load <vscale x 8 x half>, ptr 
[[TMP5]], align 2, !tbaa [[TBAA22]]
+// RV64_ZVFH-NEXT:    [[TMP6:%.*]] = tail call <vscale x 8 x half> 
@llvm.minimumnum.nxv8f16(<vscale x 8 x half> [[WIDE_LOAD]], <vscale x 8 x half> 
[[WIDE_LOAD10]])
+// RV64_ZVFH-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw [4096 x half], 
ptr @cf16, i64 0, i64 [[INDEX]]
+// RV64_ZVFH-NEXT:    store <vscale x 8 x half> [[TMP6]], ptr [[TMP7]], align 
2, !tbaa [[TBAA22]]
+// RV64_ZVFH-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
+// RV64_ZVFH-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+// RV64_ZVFH-NEXT:    br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label 
%[[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
+// RV64_ZVFH:       [[MIDDLE_BLOCK]]:
+// RV64_ZVFH-NEXT:    [[CMP_N_NOT:%.*]] = icmp eq i64 [[N_VEC]], 0
+// RV64_ZVFH-NEXT:    br i1 [[CMP_N_NOT]], label %[[FOR_BODY_PREHEADER]], 
label %[[FOR_COND_CLEANUP:.*]]
+// RV64_ZVFH:       [[FOR_BODY_PREHEADER]]:
+// RV64_ZVFH-NEXT:    [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ 
[[N_VEC]], %[[MIDDLE_BLOCK]] ]
+// RV64_ZVFH-NEXT:    br label %[[FOR_BODY:.*]]
+// RV64_ZVFH:       [[FOR_COND_CLEANUP]]:
+// RV64_ZVFH-NEXT:    ret void
+// RV64_ZVFH:       [[FOR_BODY]]:
+// RV64_ZVFH-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], 
%[[FOR_BODY]] ], [ [[INDVARS_IV_PH]], %[[FOR_BODY_PREHEADER]] ]
+// RV64_ZVFH-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x 
half], ptr @af16, i64 0, i64 [[INDVARS_IV]]
+// RV64_ZVFH-NEXT:    [[TMP9:%.*]] = load half, ptr [[ARRAYIDX]], align 2, 
!tbaa [[TBAA22]]
+// RV64_ZVFH-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x 
half], ptr @bf16, i64 0, i64 [[INDVARS_IV]]
+// RV64_ZVFH-NEXT:    [[TMP10:%.*]] = load half, ptr [[ARRAYIDX2]], align 2, 
!tbaa [[TBAA22]]
+// RV64_ZVFH-NEXT:    [[TMP11:%.*]] = tail call half @llvm.minimumnum.f16(half 
[[TMP9]], half [[TMP10]])
+// RV64_ZVFH-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x 
half], ptr @cf16, i64 0, i64 [[INDVARS_IV]]
+// RV64_ZVFH-NEXT:    store half [[TMP11]], ptr [[ARRAYIDX4]], align 2, !tbaa 
[[TBAA22]]
+// RV64_ZVFH-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+// RV64_ZVFH-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 
4096
+// RV64_ZVFH-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], 
label %[[FOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]]
+//
+void f16min() {
+       for (int i=0; i<4096; i++) {cf16[i] = __builtin_fminimum_numf16(af16[i], bf16[i]);}
+}
+// ARMV8-LABEL: define dso_local void @f16max(
+// ARMV8-SAME: ) local_unnamed_addr #[[ATTR0]] {
+// ARMV8-NEXT:  [[ENTRY:.*]]:
+// ARMV8-NEXT:    br label %[[VECTOR_BODY:.*]]
+// ARMV8:       [[VECTOR_BODY]]:
+// ARMV8-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ 
[[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+// ARMV8-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw [4096 x half], ptr 
@af16, i64 0, i64 [[INDEX]]
+// ARMV8-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], 
i64 16
+// ARMV8-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x half>, ptr [[TMP0]], align 2, 
!tbaa [[TBAA19]]
+// ARMV8-NEXT:    [[WIDE_LOAD11:%.*]] = load <8 x half>, ptr [[TMP1]], align 
2, !tbaa [[TBAA19]]
+// ARMV8-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw [4096 x half], ptr 
@bf16, i64 0, i64 [[INDEX]]
+// ARMV8-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP2]], 
i64 16
+// ARMV8-NEXT:    [[WIDE_LOAD12:%.*]] = load <8 x half>, ptr [[TMP2]], align 
2, !tbaa [[TBAA19]]
+// ARMV8-NEXT:    [[WIDE_LOAD13:%.*]] = load <8 x half>, ptr [[TMP3]], align 
2, !tbaa [[TBAA19]]
+// ARMV8-NEXT:    [[TMP4:%.*]] = tail call <8 x half> 
@llvm.maximumnum.v8f16(<8 x half> [[WIDE_LOAD]], <8 x half> [[WIDE_LOAD12]])
+// ARMV8-NEXT:    [[TMP5:%.*]] = tail call <8 x half> 
@llvm.maximumnum.v8f16(<8 x half> [[WIDE_LOAD11]], <8 x half> [[WIDE_LOAD13]])
+// ARMV8-NEXT:    [[TMP6:%.*]] = getelementptr inbounds nuw [4096 x half], ptr 
@cf16, i64 0, i64 [[INDEX]]
+// ARMV8-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP6]], 
i64 16
+// ARMV8-NEXT:    store <8 x half> [[TMP4]], ptr [[TMP6]], align 2, !tbaa 
[[TBAA19]]
+// ARMV8-NEXT:    store <8 x half> [[TMP5]], ptr [[TMP7]], align 2, !tbaa 
[[TBAA19]]
+// ARMV8-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+// ARMV8-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
+// ARMV8-NEXT:    br i1 [[TMP8]], label %[[FOR_COND_CLEANUP:.*]], label 
%[[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
+// ARMV8:       [[FOR_COND_CLEANUP]]:
+// ARMV8-NEXT:    ret void
+//
+// RV64_ZVFH-LABEL: define dso_local void @f16max(
+// RV64_ZVFH-SAME: ) local_unnamed_addr #[[ATTR0]] {
+// RV64_ZVFH-NEXT:  [[ENTRY:.*]]:
+// RV64_ZVFH-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// RV64_ZVFH-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp samesign ugt i64 
[[TMP0]], 512
+// RV64_ZVFH-NEXT:    br i1 [[MIN_ITERS_CHECK]], label 
%[[FOR_BODY_PREHEADER:.*]], label %[[VECTOR_PH:.*]]
+// RV64_ZVFH:       [[VECTOR_PH]]:
+// RV64_ZVFH-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// RV64_ZVFH-NEXT:    [[DOTNEG:%.*]] = mul nuw nsw i64 [[TMP1]], 8184
+// RV64_ZVFH-NEXT:    [[N_VEC:%.*]] = and i64 [[DOTNEG]], 4096
+// RV64_ZVFH-NEXT:    [[TMP2:%.*]] = tail call i64 @llvm.vscale.i64()
+// RV64_ZVFH-NEXT:    [[TMP3:%.*]] = shl nuw nsw i64 [[TMP2]], 3
+// RV64_ZVFH-NEXT:    br label %[[VECTOR_BODY:.*]]
+// RV64_ZVFH:       [[VECTOR_BODY]]:
+// RV64_ZVFH-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ 
[[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+// RV64_ZVFH-NEXT:    [[TMP4:%.*]] = getelementptr inbounds nuw [4096 x half], 
ptr @af16, i64 0, i64 [[INDEX]]
+// RV64_ZVFH-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 8 x half>, ptr 
[[TMP4]], align 2, !tbaa [[TBAA22]]
+// RV64_ZVFH-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw [4096 x half], 
ptr @bf16, i64 0, i64 [[INDEX]]
+// RV64_ZVFH-NEXT:    [[WIDE_LOAD10:%.*]] = load <vscale x 8 x half>, ptr 
[[TMP5]], align 2, !tbaa [[TBAA22]]
+// RV64_ZVFH-NEXT:    [[TMP6:%.*]] = tail call <vscale x 8 x half> 
@llvm.maximumnum.nxv8f16(<vscale x 8 x half> [[WIDE_LOAD]], <vscale x 8 x half> 
[[WIDE_LOAD10]])
+// RV64_ZVFH-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw [4096 x half], 
ptr @cf16, i64 0, i64 [[INDEX]]
+// RV64_ZVFH-NEXT:    store <vscale x 8 x half> [[TMP6]], ptr [[TMP7]], align 
2, !tbaa [[TBAA22]]
+// RV64_ZVFH-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
+// RV64_ZVFH-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+// RV64_ZVFH-NEXT:    br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label 
%[[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
+// RV64_ZVFH:       [[MIDDLE_BLOCK]]:
+// RV64_ZVFH-NEXT:    [[CMP_N_NOT:%.*]] = icmp eq i64 [[N_VEC]], 0
+// RV64_ZVFH-NEXT:    br i1 [[CMP_N_NOT]], label %[[FOR_BODY_PREHEADER]], 
label %[[FOR_COND_CLEANUP:.*]]
+// RV64_ZVFH:       [[FOR_BODY_PREHEADER]]:
+// RV64_ZVFH-NEXT:    [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ 
[[N_VEC]], %[[MIDDLE_BLOCK]] ]
+// RV64_ZVFH-NEXT:    br label %[[FOR_BODY:.*]]
+// RV64_ZVFH:       [[FOR_COND_CLEANUP]]:
+// RV64_ZVFH-NEXT:    ret void
+// RV64_ZVFH:       [[FOR_BODY]]:
+// RV64_ZVFH-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], 
%[[FOR_BODY]] ], [ [[INDVARS_IV_PH]], %[[FOR_BODY_PREHEADER]] ]
+// RV64_ZVFH-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x 
half], ptr @af16, i64 0, i64 [[INDVARS_IV]]
+// RV64_ZVFH-NEXT:    [[TMP9:%.*]] = load half, ptr [[ARRAYIDX]], align 2, 
!tbaa [[TBAA22]]
+// RV64_ZVFH-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x 
half], ptr @bf16, i64 0, i64 [[INDVARS_IV]]
+// RV64_ZVFH-NEXT:    [[TMP10:%.*]] = load half, ptr [[ARRAYIDX2]], align 2, 
!tbaa [[TBAA22]]
+// RV64_ZVFH-NEXT:    [[TMP11:%.*]] = tail call half @llvm.maximumnum.f16(half 
[[TMP9]], half [[TMP10]])
+// RV64_ZVFH-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x 
half], ptr @cf16, i64 0, i64 [[INDVARS_IV]]
+// RV64_ZVFH-NEXT:    store half [[TMP11]], ptr [[ARRAYIDX4]], align 2, !tbaa 
[[TBAA22]]
+// RV64_ZVFH-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+// RV64_ZVFH-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 
4096
+// RV64_ZVFH-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], 
label %[[FOR_BODY]], !llvm.loop [[LOOP27:![0-9]+]]
+//
+void f16max() {
+       for (int i=0; i<4096; i++) {cf16[i] = __builtin_fmaximum_numf16(af16[i], bf16[i]);}
+}
+
+//.
+// ARMV8: [[TBAA6]] = !{[[META7:![0-9]+]], [[META7]], i64 0}
+// ARMV8: [[META7]] = !{!"float", [[META8:![0-9]+]], i64 0}
+// ARMV8: [[META8]] = !{!"omnipotent char", [[META9:![0-9]+]], i64 0}
+// ARMV8: [[META9]] = !{!"Simple C/C++ TBAA"}
+// ARMV8: [[LOOP10]] = distinct !{[[LOOP10]], [[META11:![0-9]+]], 
[[META12:![0-9]+]], [[META13:![0-9]+]]}
+// ARMV8: [[META11]] = !{!"llvm.loop.mustprogress"}
+// ARMV8: [[META12]] = !{!"llvm.loop.isvectorized", i32 1}
+// ARMV8: [[META13]] = !{!"llvm.loop.unroll.runtime.disable"}
+// ARMV8: [[LOOP14]] = distinct !{[[LOOP14]], [[META11]], [[META12]], 
[[META13]]}
+// ARMV8: [[TBAA15]] = !{[[META16:![0-9]+]], [[META16]], i64 0}
+// ARMV8: [[META16]] = !{!"double", [[META8]], i64 0}
+// ARMV8: [[LOOP17]] = distinct !{[[LOOP17]], [[META11]], [[META12]], 
[[META13]]}
+// ARMV8: [[LOOP18]] = distinct !{[[LOOP18]], [[META11]], [[META12]], 
[[META13]]}
+// ARMV8: [[TBAA19]] = !{[[META20:![0-9]+]], [[META20]], i64 0}
+// ARMV8: [[META20]] = !{!"__fp16", [[META8]], i64 0}
+// ARMV8: [[LOOP21]] = distinct !{[[LOOP21]], [[META11]], [[META12]], 
[[META13]]}
+// ARMV8: [[LOOP22]] = distinct !{[[LOOP22]], [[META11]], [[META12]], 
[[META13]]}
+//.
+// RV64_ZVFH: [[TBAA9]] = !{[[META10:![0-9]+]], [[META10]], i64 0}
+// RV64_ZVFH: [[META10]] = !{!"float", [[META11:![0-9]+]], i64 0}
+// RV64_ZVFH: [[META11]] = !{!"omnipotent char", [[META12:![0-9]+]], i64 0}
+// RV64_ZVFH: [[META12]] = !{!"Simple C/C++ TBAA"}
+// RV64_ZVFH: [[LOOP13]] = distinct !{[[LOOP13]], [[META14:![0-9]+]], 
[[META15:![0-9]+]], [[META16:![0-9]+]]}
+// RV64_ZVFH: [[META14]] = !{!"llvm.loop.mustprogress"}
+// RV64_ZVFH: [[META15]] = !{!"llvm.loop.isvectorized", i32 1}
+// RV64_ZVFH: [[META16]] = !{!"llvm.loop.unroll.runtime.disable"}
+// RV64_ZVFH: [[LOOP17]] = distinct !{[[LOOP17]], [[META14]], [[META15]], 
[[META16]]}
+// RV64_ZVFH: [[TBAA18]] = !{[[META19:![0-9]+]], [[META19]], i64 0}
+// RV64_ZVFH: [[META19]] = !{!"double", [[META11]], i64 0}
+// RV64_ZVFH: [[LOOP20]] = distinct !{[[LOOP20]], [[META14]], [[META15]], 
[[META16]]}
+// RV64_ZVFH: [[LOOP21]] = distinct !{[[LOOP21]], [[META14]], [[META15]], 
[[META16]]}
+// RV64_ZVFH: [[TBAA22]] = !{[[META23:![0-9]+]], [[META23]], i64 0}
+// RV64_ZVFH: [[META23]] = !{!"__fp16", [[META11]], i64 0}
+// RV64_ZVFH: [[LOOP24]] = distinct !{[[LOOP24]], [[META14]], [[META15]], 
[[META16]]}
+// RV64_ZVFH: [[LOOP25]] = distinct !{[[LOOP25]], [[META14]], [[META16]], 
[[META15]]}
+// RV64_ZVFH: [[LOOP26]] = distinct !{[[LOOP26]], [[META14]], [[META15]], 
[[META16]]}
+// RV64_ZVFH: [[LOOP27]] = distinct !{[[LOOP27]], [[META14]], [[META16]], 
[[META15]]}
+//.
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index d46859bcb0517..db6d77aab70f7 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -2776,6 +2776,12 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
       }
       return Cost;
     }
+    case Intrinsic::maximumnum:
+    case Intrinsic::minimumnum: {
+      if (TLI->isOperationLegalOrPromote(llvm::ISD::FMAXNUM_IEEE, LT.second))
+        return LT.first * 3;
+      break;
+    }
     default:
       break;
     }
diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp
index 91ba68fe03324..422058be22edb 100644
--- a/llvm/lib/Analysis/VectorUtils.cpp
+++ b/llvm/lib/Analysis/VectorUtils.cpp
@@ -89,6 +89,8 @@ bool llvm::isTriviallyVectorizable(Intrinsic::ID ID) {
   case Intrinsic::maxnum:
   case Intrinsic::minimum:
   case Intrinsic::maximum:
+  case Intrinsic::minimumnum:
+  case Intrinsic::maximumnum:
   case Intrinsic::modf:
   case Intrinsic::copysign:
   case Intrinsic::floor:
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 27a4bbce1f5fc..f393f5681d641 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -961,6 +961,8 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
     static const unsigned ZvfhminZvfbfminPromoteOps[] = {
         ISD::FMINNUM,
         ISD::FMAXNUM,
+        ISD::FMINIMUMNUM,
+        ISD::FMAXIMUMNUM,
         ISD::FADD,
         ISD::FSUB,
         ISD::FMUL,
@@ -1029,7 +1031,9 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
       // Expand various condition codes (explained above).
       setCondCodeAction(VFPCCToExpand, VT, Expand);
 
-      setOperationAction({ISD::FMINNUM, ISD::FMAXNUM}, VT, Legal);
+      setOperationAction(
+          {ISD::FMINNUM, ISD::FMAXNUM, ISD::FMAXIMUMNUM, ISD::FMINIMUMNUM}, VT,
+          Legal);
       setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM}, VT, Custom);
 
       setOperationAction({ISD::FTRUNC, ISD::FCEIL, ISD::FFLOOR, ISD::FROUND,
@@ -1444,7 +1448,8 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
         setOperationAction({ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV,
                             ISD::FNEG, ISD::FABS, ISD::FCOPYSIGN, ISD::FSQRT,
                             ISD::FMA, ISD::FMINNUM, ISD::FMAXNUM,
-                            ISD::IS_FPCLASS, ISD::FMAXIMUM, ISD::FMINIMUM},
+                            ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM, ISD::IS_FPCLASS,
+                            ISD::FMAXIMUM, ISD::FMINIMUM},
                            VT, Custom);
 
         setOperationAction({ISD::FTRUNC, ISD::FCEIL, ISD::FFLOOR, ISD::FROUND,
@@ -6688,9 +6693,11 @@ static unsigned getRISCVVLOp(SDValue Op) {
   case ISD::VP_FP_TO_UINT:
     return RISCVISD::VFCVT_RTZ_XU_F_VL;
   case ISD::FMINNUM:
+  case ISD::FMINIMUMNUM:
   case ISD::VP_FMINNUM:
     return RISCVISD::VFMIN_VL;
   case ISD::FMAXNUM:
+  case ISD::FMAXIMUMNUM:
   case ISD::VP_FMAXNUM:
     return RISCVISD::VFMAX_VL;
   case ISD::LRINT:
@@ -7720,6 +7727,8 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
   case ISD::FMA:
   case ISD::FMINNUM:
   case ISD::FMAXNUM:
+  case ISD::FMINIMUMNUM:
+  case ISD::FMAXIMUMNUM:
     if (isPromotedOpNeedingSplit(Op, Subtarget))
       return SplitVectorOp(Op, DAG);
     [[fallthrough]];

_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
