llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-clang

Author: YunQiang Su (wzssyqa)

<details>
<summary>Changes</summary>

Support auto-vectorization of `fminimum_num` and `fmaximum_num`. On AArch64 with SVE, scalable vectors are not supported yet; on RISC-V with the Vector extension, scalable vectors work well now.

---

Patch is 33.38 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/131781.diff

4 Files Affected:

- (added) clang/test/CodeGen/fminimum-num-autovec.c (+407)
- (modified) llvm/include/llvm/CodeGen/BasicTTIImpl.h (+6)
- (modified) llvm/lib/Analysis/VectorUtils.cpp (+2)
- (modified) llvm/lib/Target/RISCV/RISCVISelLowering.cpp (+11-2)

``````````diff
diff --git a/clang/test/CodeGen/fminimum-num-autovec.c b/clang/test/CodeGen/fminimum-num-autovec.c
new file mode 100644
index 0000000000000..94114b6227d27
--- /dev/null
+++ b/clang/test/CodeGen/fminimum-num-autovec.c
@@ -0,0 +1,407 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// RUN: %clang --target=aarch64-unknown-linux-gnu -march=armv8+fp16 %s -O3 -emit-llvm -S -o - | FileCheck %s --check-prefix=ARMV8
+// RUN: %clang --target=riscv64-unknown-linux-gnu -march=rv64gv_zvfh %s -O3 -emit-llvm -S -o - | FileCheck %s --check-prefix=RV64_ZVFH
+// FIXME: SVE cannot emit VSCALE.
+
+
+float af32[4096];
+float bf32[4096];
+float cf32[4096];
+// ARMV8-LABEL: define dso_local void @f32min(
+// ARMV8-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+// ARMV8-NEXT: [[ENTRY:.*]]:
+// ARMV8-NEXT: br label %[[VECTOR_BODY:.*]]
+// ARMV8: [[VECTOR_BODY]]:
+// ARMV8-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+// ARMV8-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @af32, i64 0, i64 [[INDEX]]
+// ARMV8-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 16
+// ARMV8-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP0]], align 4, !tbaa [[TBAA6:![0-9]+]]
+// ARMV8-NEXT: [[WIDE_LOAD11:%.*]] = load <4 x float>, ptr [[TMP1]], align 4, !tbaa [[TBAA6]]
+// ARMV8-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @bf32, i64 0, i64 [[INDEX]]
+// ARMV8-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP2]], i64 16
+// ARMV8-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x float>, ptr [[TMP2]], align 4, !tbaa [[TBAA6]]
+// ARMV8-NEXT: [[WIDE_LOAD13:%.*]] = load <4 x float>, ptr [[TMP3]], align 4, !tbaa [[TBAA6]]
+// ARMV8-NEXT: [[TMP4:%.*]] = tail call <4 x float> @llvm.minimumnum.v4f32(<4 x float> [[WIDE_LOAD]], <4 x float> [[WIDE_LOAD12]])
+// ARMV8-NEXT: [[TMP5:%.*]] = tail call <4 x float> @llvm.minimumnum.v4f32(<4 x float> [[WIDE_LOAD11]], <4 x float> [[WIDE_LOAD13]])
+// ARMV8-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @cf32, i64 0, i64 [[INDEX]]
+// ARMV8-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP6]], i64 16
+// ARMV8-NEXT: store <4 x float> [[TMP4]], ptr [[TMP6]], align 4, !tbaa [[TBAA6]]
+// ARMV8-NEXT: store <4 x float> [[TMP5]], ptr [[TMP7]], align 4, !tbaa [[TBAA6]]
+// ARMV8-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+// ARMV8-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
+// ARMV8-NEXT: br i1 [[TMP8]], label %[[FOR_COND_CLEANUP:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+// ARMV8: [[FOR_COND_CLEANUP]]:
+// ARMV8-NEXT: ret void
+//
+// RV64_ZVFH-LABEL: define dso_local void @f32min(
+// RV64_ZVFH-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+// RV64_ZVFH-NEXT: [[ENTRY:.*]]:
+// RV64_ZVFH-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// RV64_ZVFH-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2
+// RV64_ZVFH-NEXT: br label %[[VECTOR_BODY:.*]]
+// RV64_ZVFH: [[VECTOR_BODY]]:
+// RV64_ZVFH-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+// RV64_ZVFH-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @af32, i64 0, i64 [[INDEX]]
+// RV64_ZVFH-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP2]], align 4, !tbaa [[TBAA9:![0-9]+]]
+// RV64_ZVFH-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @bf32, i64 0, i64 [[INDEX]]
+// RV64_ZVFH-NEXT: [[WIDE_LOAD10:%.*]] = load <vscale x 4 x float>, ptr [[TMP3]], align 4, !tbaa [[TBAA9]]
+// RV64_ZVFH-NEXT: [[TMP4:%.*]] = tail call <vscale x 4 x float> @llvm.minimumnum.nxv4f32(<vscale x 4 x float> [[WIDE_LOAD]], <vscale x 4 x float> [[WIDE_LOAD10]])
+// RV64_ZVFH-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @cf32, i64 0, i64 [[INDEX]]
+// RV64_ZVFH-NEXT: store <vscale x 4 x float> [[TMP4]], ptr [[TMP5]], align 4, !tbaa [[TBAA9]]
+// RV64_ZVFH-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]]
+// RV64_ZVFH-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
+// RV64_ZVFH-NEXT: br i1 [[TMP6]], label %[[FOR_COND_CLEANUP:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
+// RV64_ZVFH: [[FOR_COND_CLEANUP]]:
+// RV64_ZVFH-NEXT: ret void
+//
+void f32min() {
+ for (int i=0; i<4096; i++) {cf32[i] = __builtin_fminimum_numf(af32[i], bf32[i]);}
+}
+// ARMV8-LABEL: define dso_local void @f32max(
+// ARMV8-SAME: ) local_unnamed_addr #[[ATTR0]] {
+// ARMV8-NEXT: [[ENTRY:.*]]:
+// ARMV8-NEXT: br label %[[VECTOR_BODY:.*]]
+// ARMV8: [[VECTOR_BODY]]:
+// ARMV8-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+// ARMV8-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @af32, i64 0, i64 [[INDEX]]
+// ARMV8-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 16
+// ARMV8-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP0]], align 4, !tbaa [[TBAA6]]
+// ARMV8-NEXT: [[WIDE_LOAD11:%.*]] = load <4 x float>, ptr [[TMP1]], align 4, !tbaa [[TBAA6]]
+// ARMV8-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @bf32, i64 0, i64 [[INDEX]]
+// ARMV8-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP2]], i64 16
+// ARMV8-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x float>, ptr [[TMP2]], align 4, !tbaa [[TBAA6]]
+// ARMV8-NEXT: [[WIDE_LOAD13:%.*]] = load <4 x float>, ptr [[TMP3]], align 4, !tbaa [[TBAA6]]
+// ARMV8-NEXT: [[TMP4:%.*]] = tail call <4 x float> @llvm.maximumnum.v4f32(<4 x float> [[WIDE_LOAD]], <4 x float> [[WIDE_LOAD12]])
+// ARMV8-NEXT: [[TMP5:%.*]] = tail call <4 x float> @llvm.maximumnum.v4f32(<4 x float> [[WIDE_LOAD11]], <4 x float> [[WIDE_LOAD13]])
+// ARMV8-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @cf32, i64 0, i64 [[INDEX]]
+// ARMV8-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP6]], i64 16
+// ARMV8-NEXT: store <4 x float> [[TMP4]], ptr [[TMP6]], align 4, !tbaa [[TBAA6]]
+// ARMV8-NEXT: store <4 x float> [[TMP5]], ptr [[TMP7]], align 4, !tbaa [[TBAA6]]
+// ARMV8-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+// ARMV8-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
+// ARMV8-NEXT: br i1 [[TMP8]], label %[[FOR_COND_CLEANUP:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+// ARMV8: [[FOR_COND_CLEANUP]]:
+// ARMV8-NEXT: ret void
+//
+// RV64_ZVFH-LABEL: define dso_local void @f32max(
+// RV64_ZVFH-SAME: ) local_unnamed_addr #[[ATTR0]] {
+// RV64_ZVFH-NEXT: [[ENTRY:.*]]:
+// RV64_ZVFH-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// RV64_ZVFH-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2
+// RV64_ZVFH-NEXT: br label %[[VECTOR_BODY:.*]]
+// RV64_ZVFH: [[VECTOR_BODY]]:
+// RV64_ZVFH-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+// RV64_ZVFH-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @af32, i64 0, i64 [[INDEX]]
+// RV64_ZVFH-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP2]], align 4, !tbaa [[TBAA9]]
+// RV64_ZVFH-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @bf32, i64 0, i64 [[INDEX]]
+// RV64_ZVFH-NEXT: [[WIDE_LOAD10:%.*]] = load <vscale x 4 x float>, ptr [[TMP3]], align 4, !tbaa [[TBAA9]]
+// RV64_ZVFH-NEXT: [[TMP4:%.*]] = tail call <vscale x 4 x float> @llvm.maximumnum.nxv4f32(<vscale x 4 x float> [[WIDE_LOAD]], <vscale x 4 x float> [[WIDE_LOAD10]])
+// RV64_ZVFH-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @cf32, i64 0, i64 [[INDEX]]
+// RV64_ZVFH-NEXT: store <vscale x 4 x float> [[TMP4]], ptr [[TMP5]], align 4, !tbaa [[TBAA9]]
+// RV64_ZVFH-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]]
+// RV64_ZVFH-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
+// RV64_ZVFH-NEXT: br i1 [[TMP6]], label %[[FOR_COND_CLEANUP:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
+// RV64_ZVFH: [[FOR_COND_CLEANUP]]:
+// RV64_ZVFH-NEXT: ret void
+//
+void f32max() {
+ for (int i=0; i<4096; i++) {cf32[i] = __builtin_fmaximum_numf(af32[i], bf32[i]);}
+}
+
+double af64[4096];
+double bf64[4096];
+double cf64[4096];
+// ARMV8-LABEL: define dso_local void @f64min(
+// ARMV8-SAME: ) local_unnamed_addr #[[ATTR0]] {
+// ARMV8-NEXT: [[ENTRY:.*]]:
+// ARMV8-NEXT: br label %[[VECTOR_BODY:.*]]
+// ARMV8: [[VECTOR_BODY]]:
+// ARMV8-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+// ARMV8-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @af64, i64 0, i64 [[INDEX]]
+// ARMV8-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 16
+// ARMV8-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP0]], align 8, !tbaa [[TBAA15:![0-9]+]]
+// ARMV8-NEXT: [[WIDE_LOAD11:%.*]] = load <2 x double>, ptr [[TMP1]], align 8, !tbaa [[TBAA15]]
+// ARMV8-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @bf64, i64 0, i64 [[INDEX]]
+// ARMV8-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP2]], i64 16
+// ARMV8-NEXT: [[WIDE_LOAD12:%.*]] = load <2 x double>, ptr [[TMP2]], align 8, !tbaa [[TBAA15]]
+// ARMV8-NEXT: [[WIDE_LOAD13:%.*]] = load <2 x double>, ptr [[TMP3]], align 8, !tbaa [[TBAA15]]
+// ARMV8-NEXT: [[TMP4:%.*]] = tail call <2 x double> @llvm.minimumnum.v2f64(<2 x double> [[WIDE_LOAD]], <2 x double> [[WIDE_LOAD12]])
+// ARMV8-NEXT: [[TMP5:%.*]] = tail call <2 x double> @llvm.minimumnum.v2f64(<2 x double> [[WIDE_LOAD11]], <2 x double> [[WIDE_LOAD13]])
+// ARMV8-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @cf64, i64 0, i64 [[INDEX]]
+// ARMV8-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP6]], i64 16
+// ARMV8-NEXT: store <2 x double> [[TMP4]], ptr [[TMP6]], align 8, !tbaa [[TBAA15]]
+// ARMV8-NEXT: store <2 x double> [[TMP5]], ptr [[TMP7]], align 8, !tbaa [[TBAA15]]
+// ARMV8-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+// ARMV8-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
+// ARMV8-NEXT: br i1 [[TMP8]], label %[[FOR_COND_CLEANUP:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
+// ARMV8: [[FOR_COND_CLEANUP]]:
+// ARMV8-NEXT: ret void
+//
+// RV64_ZVFH-LABEL: define dso_local void @f64min(
+// RV64_ZVFH-SAME: ) local_unnamed_addr #[[ATTR0]] {
+// RV64_ZVFH-NEXT: [[ENTRY:.*]]:
+// RV64_ZVFH-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// RV64_ZVFH-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 1
+// RV64_ZVFH-NEXT: br label %[[VECTOR_BODY:.*]]
+// RV64_ZVFH: [[VECTOR_BODY]]:
+// RV64_ZVFH-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+// RV64_ZVFH-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @af64, i64 0, i64 [[INDEX]]
+// RV64_ZVFH-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x double>, ptr [[TMP2]], align 8, !tbaa [[TBAA18:![0-9]+]]
+// RV64_ZVFH-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @bf64, i64 0, i64 [[INDEX]]
+// RV64_ZVFH-NEXT: [[WIDE_LOAD10:%.*]] = load <vscale x 2 x double>, ptr [[TMP3]], align 8, !tbaa [[TBAA18]]
+// RV64_ZVFH-NEXT: [[TMP4:%.*]] = tail call <vscale x 2 x double> @llvm.minimumnum.nxv2f64(<vscale x 2 x double> [[WIDE_LOAD]], <vscale x 2 x double> [[WIDE_LOAD10]])
+// RV64_ZVFH-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @cf64, i64 0, i64 [[INDEX]]
+// RV64_ZVFH-NEXT: store <vscale x 2 x double> [[TMP4]], ptr [[TMP5]], align 8, !tbaa [[TBAA18]]
+// RV64_ZVFH-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]]
+// RV64_ZVFH-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
+// RV64_ZVFH-NEXT: br i1 [[TMP6]], label %[[FOR_COND_CLEANUP:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+// RV64_ZVFH: [[FOR_COND_CLEANUP]]:
+// RV64_ZVFH-NEXT: ret void
+//
+void f64min() {
+ for (int i=0; i<4096; i++) {cf64[i] = __builtin_fminimum_num(af64[i], bf64[i]);}
+}
+// ARMV8-LABEL: define dso_local void @f64max(
+// ARMV8-SAME: ) local_unnamed_addr #[[ATTR0]] {
+// ARMV8-NEXT: [[ENTRY:.*]]:
+// ARMV8-NEXT: br label %[[VECTOR_BODY:.*]]
+// ARMV8: [[VECTOR_BODY]]:
+// ARMV8-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+// ARMV8-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @af64, i64 0, i64 [[INDEX]]
+// ARMV8-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 16
+// ARMV8-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP0]], align 8, !tbaa [[TBAA15]]
+// ARMV8-NEXT: [[WIDE_LOAD11:%.*]] = load <2 x double>, ptr [[TMP1]], align 8, !tbaa [[TBAA15]]
+// ARMV8-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @bf64, i64 0, i64 [[INDEX]]
+// ARMV8-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP2]], i64 16
+// ARMV8-NEXT: [[WIDE_LOAD12:%.*]] = load <2 x double>, ptr [[TMP2]], align 8, !tbaa [[TBAA15]]
+// ARMV8-NEXT: [[WIDE_LOAD13:%.*]] = load <2 x double>, ptr [[TMP3]], align 8, !tbaa [[TBAA15]]
+// ARMV8-NEXT: [[TMP4:%.*]] = tail call <2 x double> @llvm.maximumnum.v2f64(<2 x double> [[WIDE_LOAD]], <2 x double> [[WIDE_LOAD12]])
+// ARMV8-NEXT: [[TMP5:%.*]] = tail call <2 x double> @llvm.maximumnum.v2f64(<2 x double> [[WIDE_LOAD11]], <2 x double> [[WIDE_LOAD13]])
+// ARMV8-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @cf64, i64 0, i64 [[INDEX]]
+// ARMV8-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP6]], i64 16
+// ARMV8-NEXT: store <2 x double> [[TMP4]], ptr [[TMP6]], align 8, !tbaa [[TBAA15]]
+// ARMV8-NEXT: store <2 x double> [[TMP5]], ptr [[TMP7]], align 8, !tbaa [[TBAA15]]
+// ARMV8-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+// ARMV8-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
+// ARMV8-NEXT: br i1 [[TMP8]], label %[[FOR_COND_CLEANUP:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
+// ARMV8: [[FOR_COND_CLEANUP]]:
+// ARMV8-NEXT: ret void
+//
+// RV64_ZVFH-LABEL: define dso_local void @f64max(
+// RV64_ZVFH-SAME: ) local_unnamed_addr #[[ATTR0]] {
+// RV64_ZVFH-NEXT: [[ENTRY:.*]]:
+// RV64_ZVFH-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// RV64_ZVFH-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 1
+// RV64_ZVFH-NEXT: br label %[[VECTOR_BODY:.*]]
+// RV64_ZVFH: [[VECTOR_BODY]]:
+// RV64_ZVFH-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+// RV64_ZVFH-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @af64, i64 0, i64 [[INDEX]]
+// RV64_ZVFH-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x double>, ptr [[TMP2]], align 8, !tbaa [[TBAA18]]
+// RV64_ZVFH-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @bf64, i64 0, i64 [[INDEX]]
+// RV64_ZVFH-NEXT: [[WIDE_LOAD10:%.*]] = load <vscale x 2 x double>, ptr [[TMP3]], align 8, !tbaa [[TBAA18]]
+// RV64_ZVFH-NEXT: [[TMP4:%.*]] = tail call <vscale x 2 x double> @llvm.maximumnum.nxv2f64(<vscale x 2 x double> [[WIDE_LOAD]], <vscale x 2 x double> [[WIDE_LOAD10]])
+// RV64_ZVFH-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @cf64, i64 0, i64 [[INDEX]]
+// RV64_ZVFH-NEXT: store <vscale x 2 x double> [[TMP4]], ptr [[TMP5]], align 8, !tbaa [[TBAA18]]
+// RV64_ZVFH-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]]
+// RV64_ZVFH-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
+// RV64_ZVFH-NEXT: br i1 [[TMP6]], label %[[FOR_COND_CLEANUP:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
+// RV64_ZVFH: [[FOR_COND_CLEANUP]]:
+// RV64_ZVFH-NEXT: ret void
+//
+void f64max() {
+ for (int i=0; i<4096; i++) {cf64[i] = __builtin_fmaximum_num(af64[i], bf64[i]);}
+}
+
+__fp16 af16[4096];
+__fp16 bf16[4096];
+__fp16 cf16[4096];
+// ARMV8-LABEL: define dso_local void @f16min(
+// ARMV8-SAME: ) local_unnamed_addr #[[ATTR0]] {
+// ARMV8-NEXT: [[ENTRY:.*]]:
+// ARMV8-NEXT: br label %[[VECTOR_BODY:.*]]
+// ARMV8: [[VECTOR_BODY]]:
+// ARMV8-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+// ARMV8-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @af16, i64 0, i64 [[INDEX]]
+// ARMV8-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 16
+// ARMV8-NEXT: [[WIDE_LOAD:%.*]] = load <8 x half>, ptr [[TMP0]], align 2, !tbaa [[TBAA19:![0-9]+]]
+// ARMV8-NEXT: [[WIDE_LOAD11:%.*]] = load <8 x half>, ptr [[TMP1]], align 2, !tbaa [[TBAA19]]
+// ARMV8-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @bf16, i64 0, i64 [[INDEX]]
+// ARMV8-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP2]], i64 16
+// ARMV8-NEXT: [[WIDE_LOAD12:%.*]] = load <8 x half>, ptr [[TMP2]], align 2, !tbaa [[TBAA19]]
+// ARMV8-NEXT: [[WIDE_LOAD13:%.*]] = load <8 x half>, ptr [[TMP3]], align 2, !tbaa [[TBAA19]]
+// ARMV8-NEXT: [[TMP4:%.*]] = tail call <8 x half> @llvm.minimumnum.v8f16(<8 x half> [[WIDE_LOAD]], <8 x half> [[WIDE_LOAD12]])
+// ARMV8-NEXT: [[TMP5:%.*]] = tail call <8 x half> @llvm.minimumnum.v8f16(<8 x half> [[WIDE_LOAD11]], <8 x half> [[WIDE_LOAD13]])
+// ARMV8-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @cf16, i64 0, i64 [[INDEX]]
+// ARMV8-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP6]], i64 16
+// ARMV8-NEXT: store <8 x half> [[TMP4]], ptr [[TMP6]], align 2, !tbaa [[TBAA19]]
+// ARMV8-NEXT: store <8 x half> [[TMP5]], ptr [[TMP7]], align 2, !tbaa [[TBAA19]]
+// ARMV8-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+// ARMV8-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
+// ARMV8-NEXT: br i1 [[TMP8]], label %[[FOR_COND_CLEANUP:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
+// ARMV8: [[FOR_COND_CLEANUP]]:
+// ARMV8-NEXT: ret void
+//
+// RV64_ZVFH-LABEL: define dso_local void @f16min(
+// RV64_ZVFH-SAME: ) local_unnamed_addr #[[ATTR0]] {
+// RV64_ZVFH-NEXT: [[ENTRY:.*]]:
+// RV64_ZVFH-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// RV64_ZVFH-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp samesign ugt i64 [[TMP0]], 512
+// RV64_ZVFH-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[FOR_BODY_PREHEADER:.*]], label %[[VECTOR_PH:.*]]
+// RV64_ZVFH: [[VECTOR_PH]]:
+// RV64_ZVFH-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// RV64_ZVFH-NEXT: [[DOTNEG:%.*]] = mul nuw nsw i64 [[TMP1]], 8184
+// RV64_ZVFH-NEXT: [[N_VEC:%.*]] = and i64 [[DOTNEG]], 4096
+// RV64_ZVFH-NEXT: [[TMP2:%.*]] = tail call i64 @llvm.vscale.i64()
+// RV64_ZVFH-NEXT: [[TMP3:%.*]] = shl nuw nsw i64 [[TMP2]], 3
+// RV64_ZVFH-NEXT: br label %[[VECTOR_BODY:.*]]
+// RV64_ZVFH: [[VECTOR_BODY]]:
+// RV64_ZVFH-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+// RV64_ZVFH-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @af16, i64 0, i64 [[INDEX]]
+// RV64_ZVFH-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x half>, ptr [[TMP4]], align 2, !tbaa [[TBAA22:![0-9]+]]
+// RV64_ZVFH-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @bf16, i64 0, i64 [[INDEX]]
+// RV64_ZVFH-NEXT: [[WIDE_LOAD10:%.*]] = load <vscale x 8 x half>, ptr [[TMP5]], align 2, !tbaa [[TBAA22]]
+// RV64_ZVFH-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x half> @llvm.minimumnum.nxv8f16(<vscale x 8 x half> [[WIDE_LOAD]], <vscale x 8 x half> [[WIDE_LOAD10]])
+// RV64_ZVFH-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @cf16, i64 0, i64 [[INDEX]]
+// RV64_ZVFH-NEXT: store <vscale x 8 x half> [[TMP6]], ptr [[TMP7]], align 2, !tbaa [[TBAA22]]
+// RV64_ZVFH-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
+// RV64_ZVFH-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+// RV64_ZVFH-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
+// RV64_ZVFH: [[MIDDLE_BLOCK]]:
+// RV64_ZVFH-NEXT: [[CMP_N_NOT:%.*]] = icmp eq i64 [[N_VEC]], 0
+// RV64_ZVFH-NEXT: br i1 [[CMP_N_NOT]], label %[[FOR_BODY_PREHEADER]], label %[[F... [truncated]
``````````

</details>

https://github.com/llvm/llvm-project/pull/131781
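For readers who skip the autogenerated checks above: the kernel under test is just a plain loop over the C23-style builtins. Below is a distilled sketch (hypothetical file and shortened array names, not part of the patch) of the pattern that should now vectorize, per the RUN lines in the test:

```c
/* min_num_repro.c -- illustrative reproducer distilled from the test above.
 * Build, following the test's RUN line for RISC-V:
 *   clang --target=riscv64-unknown-linux-gnu -march=rv64gv_zvfh -O3 -S -emit-llvm min_num_repro.c
 * With this patch applied, the loop is expected to become a vector loop calling
 * @llvm.minimumnum.nxv4f32 (scalable) on RISC-V V, or @llvm.minimumnum.v4f32
 * (fixed-width NEON) on AArch64, as shown in the CHECK lines above. */
float a[4096], b[4096], c[4096];

void f32min(void) {
  for (int i = 0; i < 4096; i++) {
    /* __builtin_fminimum_numf has IEEE 754-2019 minimumNumber semantics:
       when exactly one operand is a NaN, the numeric operand is returned. */
    c[i] = __builtin_fminimum_numf(a[i], b[i]);
  }
}
```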