Issue |
137213
|
Summary |
[X86][LV] Zen4: Loop Vectorization with masked memops slower than scalar
|
Labels |
|
Assignees |
|
Reporter |
mcinally
|
While investigating a performance issue in SPEC CPU2006 481.wrf (WRF version 2.0.2) on Zen4, we discovered that the loop-vectorized code for the SINT subroutine (loops at sint.f90:95-212) is slower than the equivalent scalar code. This appears to be caused by how masked loads/stores and gathers/scatters are costed during LoopVectorize.
It's also worth noting that AOCC 5.0.0 appears to avoid this issue through [BOSCC](https://reviews.llvm.org/D139074). Is there still interest in upstreaming that work? That would be interesting to pursue.
Here's an IR reproducer for one particular masked gather/scatter issue in SINT (there are others):
```
; opt -passes=loop-vectorize -pass-remarks=loop-vectorize repro.ll -S -o repro.llvm
; Module header for the reproducer. The datalayout and triple below pin the
; module to x86_64-unknown-linux-gnu; the CPU itself (znver4) is selected by the
; "target-cpu" string attribute in attribute group #0, not here.
; ModuleID = 'FIRModule'
source_filename = "FIRModule"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
; @foo_ -- reproducer kernel (DISubprogram !8 names it "foo", linkage name "foo_").
;
; Arguments, named after the TBAA tags attached to the accesses through them:
;   %0 = n       (i32 loaded once in the entry block)
;   %1 = arr1    (float, written)
;   %2 = arr2    (float, read)
;   %3 = arr3    (float, read)
;   %4 = x, %5 = y, %6 = z   (i32 index arrays, read)
;   %7 = icmask  (i32 values tested against 0)
;
; Shape: a triple loop nest, each level counting 1..n. The innermost loop
; exists in two copies, selected per middle-loop iteration by one icmask
; element: mask != 0 runs the fadd copy, mask == 0 runs the fsub copy.
; In both copies the float accesses are indirect: their element index is
; computed from values loaded out of x/y/z.
; Function Attrs: nofree norecurse nosync nounwind memory(argmem: readwrite)
define void @foo_(ptr nocapture readonly %0, ptr nocapture writeonly %1, ptr nocapture readonly %2, ptr nocapture readonly %3, ptr nocapture readonly %4, ptr nocapture readonly %5, ptr nocapture readonly %6, ptr nocapture readonly %7) local_unnamed_addr #0 !dbg !8 {
; Entry: %10 = sext(n) is the loop bound used by every exit compare;
; %11 = max(n, 0) is the stride used in the index arithmetic.
; The whole nest is skipped unless n > 0.
%9 = load i32, ptr %0, align 4, !tbaa !23
%10 = sext i32 %9 to i64
%11 = tail call i64 @llvm.smax.i64(i64 %10, i64 0)
%12 = icmp sgt i32 %9, 0
br i1 %12, label %.preheader13.lr.ph, label %._crit_edge
.preheader13.lr.ph: ; preds = %8
; %13 = %11 * %11, the stride of the outermost dimension.
%13 = mul nuw nsw i64 %11, %11
br label %.preheader13.us
.preheader13.us: ; preds = %._crit_edge14.split.us.us, %.preheader13.lr.ph
; Outer loop header; %indvars.iv27 starts at 1.
;   %16 = &icmask[(iv27-1) * %11]   (base of the mask elements for this iteration)
;   %18 = (iv27-1) * %13 - 1        (the -1 folds in the inner loop's 1-based start)
%indvars.iv27 = phi i64 [ %indvars.iv.next28, %._crit_edge14.split.us.us ], [ 1, %.preheader13.lr.ph ]
%14 = add nsw i64 %indvars.iv27, -1
%15 = mul nuw nsw i64 %14, %11
%16 = getelementptr i32, ptr %7, i64 %15
%17 = mul nsw i64 %14, %13
%18 = add i64 %17, -1
br label %.preheader.us.us
.preheader.us.us: ; preds = %._crit_edge.us.us, %.preheader13.us
; Middle loop header; %indvars.iv23 starts at 1.
; Loads one icmask element (%21) and computes
;   %23 = %18 + (iv23-1) * %11
; so that %23 + inner_iv is the zero-based linear index
;   (inner_iv-1) + (iv23-1)*%11 + (iv27-1)*%13.
; The mask is tested here, outside the innermost loop, and picks which copy
; of that loop runs.
%indvars.iv23 = phi i64 [ %indvars.iv.next24, %._crit_edge.us.us ], [ 1, %.preheader13.us ]
%19 = add nsw i64 %indvars.iv23, -1
%20 = getelementptr i32, ptr %16, i64 %19
%21 = load i32, ptr %20, align 4, !tbaa !73
%.not.us.us = icmp eq i32 %21, 0
%22 = mul nuw nsw i64 %19, %11
%23 = add i64 %18, %22
br i1 %.not.us.us, label %.lr.ph.split.us.us.us.preheader, label %.lr.ph.split.us16.us.preheader
.lr.ph.split.us16.us.preheader: ; preds = %.preheader.us.us
br label %.lr.ph.split.us16.us
.lr.ph.split.us.us.us.preheader: ; preds = %.preheader.us.us
br label %.lr.ph.split.us.us.us
.lr.ph.split.us16.us: ; preds = %.lr.ph.split.us16.us.preheader, %.lr.ph.split.us16.us
; Innermost loop, mask != 0 copy: arr1[idx] = arr3[idx] + arr2[idx].
; %24 is the linear position; x/y/z are read contiguously at %24.
%indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph.split.us16.us ], [ 1, %.lr.ph.split.us16.us.preheader ]
%24 = add i64 %23, %indvars.iv
%25 = getelementptr i32, ptr %4, i64 %24
%26 = load i32, ptr %25, align 4, !tbaa !79
%27 = sext i32 %26 to i64
%28 = getelementptr i32, ptr %5, i64 %24
%29 = load i32, ptr %28, align 4, !tbaa !81
%30 = sext i32 %29 to i64
%31 = getelementptr i32, ptr %6, i64 %24
%32 = load i32, ptr %31, align 4, !tbaa !83
%33 = sext i32 %32 to i64
; idx (%40) = (x-1) + (y-1)*%11 + (z-1)*%13 -- depends on loaded data, so the
; arr2/arr3 loads and the arr1 store below are indirect accesses.
%34 = add nsw i64 %27, -1
%35 = add nsw i64 %30, -1
%36 = mul nsw i64 %35, %11
%37 = add nsw i64 %34, %36
%38 = add nsw i64 %33, -1
%39 = mul nsw i64 %38, %13
%40 = add nsw i64 %37, %39
%41 = getelementptr float, ptr %2, i64 %40
%42 = load float, ptr %41, align 4, !tbaa !85
%43 = getelementptr float, ptr %3, i64 %40
%44 = load float, ptr %43, align 4, !tbaa !87
%45 = fadd fast float %44, %42
%46 = getelementptr float, ptr %1, i64 %40
; NOTE(review): this store carries no !tbaa tag, whereas the corresponding
; store in the mask == 0 copy is tagged !tbaa !89 -- confirm whether that
; difference is intentional in the reproducer.
store float %45, ptr %46, align 4
; Exit compare uses the pre-increment value, so the body runs for iv = 1..%10.
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond.not = icmp eq i64 %indvars.iv, %10
br i1 %exitcond.not, label %._crit_edge.us.us.loopexit32, label %.lr.ph.split.us16.us
._crit_edge.us.us.loopexit: ; preds = %.lr.ph.split.us.us.us
br label %._crit_edge.us.us
._crit_edge.us.us.loopexit32: ; preds = %.lr.ph.split.us16.us
br label %._crit_edge.us.us
._crit_edge.us.us: ; preds = %._crit_edge.us.us.loopexit32, %._crit_edge.us.us.loopexit
; Middle loop latch: both innermost copies rejoin here; runs for iv23 = 1..%10.
%indvars.iv.next24 = add nuw nsw i64 %indvars.iv23, 1
%exitcond26.not = icmp eq i64 %indvars.iv23, %10
br i1 %exitcond26.not, label %._crit_edge14.split.us.us, label %.preheader.us.us
.lr.ph.split.us.us.us: ; preds = %.lr.ph.split.us.us.us.preheader, %.lr.ph.split.us.us.us
; Innermost loop, mask == 0 copy: arr1[idx] = arr2[idx] - arr3[idx].
; Same index computation as the other copy; only the float operation differs
; (and the store here is TBAA-tagged).
%indvars.iv19 = phi i64 [ %indvars.iv.next20, %.lr.ph.split.us.us.us ], [ 1, %.lr.ph.split.us.us.us.preheader ]
%47 = add i64 %23, %indvars.iv19
%48 = getelementptr i32, ptr %4, i64 %47
%49 = load i32, ptr %48, align 4, !tbaa !79
%50 = sext i32 %49 to i64
%51 = getelementptr i32, ptr %5, i64 %47
%52 = load i32, ptr %51, align 4, !tbaa !81
%53 = sext i32 %52 to i64
%54 = getelementptr i32, ptr %6, i64 %47
%55 = load i32, ptr %54, align 4, !tbaa !83
%56 = sext i32 %55 to i64
; idx (%63) = (x-1) + (y-1)*%11 + (z-1)*%13
%57 = add nsw i64 %50, -1
%58 = add nsw i64 %53, -1
%59 = mul nsw i64 %58, %11
%60 = add nsw i64 %57, %59
%61 = add nsw i64 %56, -1
%62 = mul nsw i64 %61, %13
%63 = add nsw i64 %60, %62
%64 = getelementptr float, ptr %2, i64 %63
%65 = load float, ptr %64, align 4, !tbaa !85
%66 = getelementptr float, ptr %3, i64 %63
%67 = load float, ptr %66, align 4, !tbaa !87
%68 = fsub fast float %65, %67
%69 = getelementptr float, ptr %1, i64 %63
store float %68, ptr %69, align 4, !tbaa !89
%indvars.iv.next20 = add nuw nsw i64 %indvars.iv19, 1
%exitcond22.not = icmp eq i64 %indvars.iv19, %10
br i1 %exitcond22.not, label %._crit_edge.us.us.loopexit, label %.lr.ph.split.us.us.us
._crit_edge14.split.us.us: ; preds = %._crit_edge.us.us
; Outer loop latch; runs for iv27 = 1..%10.
%indvars.iv.next28 = add nuw nsw i64 %indvars.iv27, 1
%exitcond30.not = icmp eq i64 %indvars.iv27, %10
br i1 %exitcond30.not, label %._crit_edge.loopexit, label %.preheader13.us
._crit_edge.loopexit: ; preds = %._crit_edge14.split.us.us
br label %._crit_edge
._crit_edge: ; preds = %._crit_edge.loopexit, %8
ret void
}
; Signed-maximum intrinsic; @foo_ calls it once in its entry block as
; smax(sext(n), 0).
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare i64 @llvm.smax.i64(i64, i64) #1
; #0 is attached to @foo_: besides the memory/unwind attributes it carries the
; fast-math string attributes and "target-cpu"="znver4", which selects the CPU
; the issue is about.
attributes #0 = { nofree norecurse nosync nounwind memory(argmem: readwrite) "approx-func-fp-math"="true" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "target-cpu"="znver4" "unsafe-fp-math"="true" }
; #1 is attached to the @llvm.smax.i64 declaration.
attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
; ---- Named metadata: module flags (!0-!4) and producer identification (!7) ----
!llvm.module.flags = !{!0, !1, !2, !3, !4}
!llvm.ident = !{!7}
!0 = !{i32 2, !"Debug Info Version", i32 3}
!1 = !{i32 8, !"PIC Level", i32 2}
!2 = !{i32 7, !"PIE Level", i32 2}
!3 = !{i32 1, !"Code Model", i32 4}
!4 = !{i32 1, !"Large Data Threshold", i64 0}
; ---- Debug info ----
; !8 is the DISubprogram attached to @foo_ via !dbg; its unit is !5.
; NOTE(review): no !llvm.dbg.cu named metadata is visible in this reproducer,
; and metadata numbering has gaps (e.g. !21, !22) -- presumably the file was
; reduced by hand; confirm it still parses and verifies as intended.
!5 = distinct !DICompileUnit(language: DW_LANG_Fortran95, file: !6, producer: "flang version 20.1.0 (https://github.com/llvm/llvm-project.git 24a30daaa559829ad079f2ff7f73eb4e18095f88)", isOptimized: true, runtimeVersion: 0, emissionKind: NoDebug)
!6 = !DIFile(filename: "test.f90", directory: "/home/cmcinally")
!7 = !{!"flang version 20.1.0 (https://github.com/llvm/llvm-project.git 24a30daaa559829ad079f2ff7f73eb4e18095f88)"}
!8 = distinct !DISubprogram(name: "foo", linkageName: "foo_", scope: !6, file: !6, line: 1, type: !9, scopeLine: 1, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !5)
!9 = !DISubroutineType(cc: DW_CC_normal, types: !10)
!10 = !{null, !11, !12, !12, !12, !16, !16, !16, !17}
!11 = !DIBasicType(name: "integer", size: 32, encoding: DW_ATE_signed)
!12 = !DICompositeType(tag: DW_TAG_array_type, baseType: !13, elements: !14)
!13 = !DIBasicType(name: "real", size: 32, encoding: DW_ATE_float)
!14 = !{!15, !15, !15}
!15 = !DISubrange()
!16 = !DICompositeType(tag: DW_TAG_array_type, baseType: !11, elements: !14)
!17 = !DICompositeType(tag: DW_TAG_array_type, baseType: !18, elements: !19)
!18 = !DIBasicType(name: "logical", size: 32, encoding: DW_ATE_boolean)
!19 = !{!15, !15}
; Local-variable descriptions for the dummy arguments (n, arr1..arr3, x, y, z,
; icmask) and their artificial extent variables. No instruction in the visible
; body of @foo_ references these nodes.
!20 = !DILocalVariable(name: "n", arg: 1, scope: !8, file: !6, line: 3, type: !11)
; ---- TBAA: !23 tags the load of n; !24-!28 form the shared parent chain up to
; the root node !28 ----
!23 = !{!24, !24, i64 0}
!24 = !{!"dummy arg data/_QFfooEn", !25, i64 0}
!25 = !{!"dummy arg data", !26, i64 0}
!26 = !{!"any data access", !27, i64 0}
!27 = !{!"any access", !28, i64 0}
!28 = !{!"Flang function root _QPfoo"}
!29 = !DILocalVariable(name: "._QFfooEarr11", scope: !8, file: !6, type: !30, flags: DIFlagArtificial)
!30 = !DIBasicType(name: "integer", size: 64, encoding: DW_ATE_signed)
!31 = !DILocalVariable(name: "arr1", arg: 2, scope: !8, file: !6, line: 4, type: !32)
!32 = !DICompositeType(tag: DW_TAG_array_type, baseType: !13, elements: !33)
!33 = !{!34, !34, !34}
!34 = !DISubrange(count: !29)
!35 = !DILocalVariable(name: "._QFfooEarr21", scope: !8, file: !6, type: !30, flags: DIFlagArtificial)
!37 = !DILocalVariable(name: "arr2", arg: 3, scope: !8, file: !6, line: 4, type: !38)
!38 = !DICompositeType(tag: DW_TAG_array_type, baseType: !13, elements: !39)
!39 = !{!40, !40, !40}
!40 = !DISubrange(count: !35)
!41 = !DILocalVariable(name: "._QFfooEarr31", scope: !8, file: !6, type: !30, flags: DIFlagArtificial)
!43 = !DILocalVariable(name: "arr3", arg: 4, scope: !8, file: !6, line: 4, type: !44)
!44 = !DICompositeType(tag: DW_TAG_array_type, baseType: !13, elements: !45)
!45 = !{!46, !46, !46}
!46 = !DISubrange(count: !41)
!47 = !DILocalVariable(name: "._QFfooEicmask1", scope: !8, file: !6, type: !30, flags: DIFlagArtificial)
!49 = !DILocalVariable(name: "icmask", arg: 8, scope: !8, file: !6, line: 6, type: !50)
!50 = !DICompositeType(tag: DW_TAG_array_type, baseType: !18, elements: !51)
!51 = !{!52, !52}
!52 = !DISubrange(count: !47)
!53 = !DILocalVariable(name: "._QFfooEx1", scope: !8, file: !6, type: !30, flags: DIFlagArtificial)
!55 = !DILocalVariable(name: "x", arg: 5, scope: !8, file: !6, line: 5, type: !56)
!56 = !DICompositeType(tag: DW_TAG_array_type, baseType: !11, elements: !57)
!57 = !{!58, !58, !58}
!58 = !DISubrange(count: !53)
!59 = !DILocalVariable(name: "._QFfooEy1", scope: !8, file: !6, type: !30, flags: DIFlagArtificial)
!61 = !DILocalVariable(name: "y", arg: 6, scope: !8, file: !6, line: 5, type: !62)
!62 = !DICompositeType(tag: DW_TAG_array_type, baseType: !11, elements: !63)
!63 = !{!64, !64, !64}
!64 = !DISubrange(count: !59)
!65 = !DILocalVariable(name: "._QFfooEz1", scope: !8, file: !6, type: !30, flags: DIFlagArtificial)
!67 = !DILocalVariable(name: "z", arg: 7, scope: !8, file: !6, line: 5, type: !68)
!68 = !DICompositeType(tag: DW_TAG_array_type, baseType: !11, elements: !69)
!69 = !{!70, !70, !70}
!70 = !DISubrange(count: !65)
; ---- TBAA access tags used inside the loops, one per dummy argument; every
; type node here is a child of "dummy arg data" (!25) ----
;   !73 icmask, !79 x, !81 y, !83 z, !85 arr2, !87 arr3, !89 arr1
!73 = !{!74, !74, i64 0}
!74 = !{!"dummy arg data/_QFfooEicmask", !25, i64 0}
!79 = !{!80, !80, i64 0}
!80 = !{!"dummy arg data/_QFfooEx", !25, i64 0}
!81 = !{!82, !82, i64 0}
!82 = !{!"dummy arg data/_QFfooEy", !25, i64 0}
!83 = !{!84, !84, i64 0}
!84 = !{!"dummy arg data/_QFfooEz", !25, i64 0}
!85 = !{!86, !86, i64 0}
!86 = !{!"dummy arg data/_QFfooEarr2", !25, i64 0}
!87 = !{!88, !88, i64 0}
!88 = !{!"dummy arg data/_QFfooEarr3", !25, i64 0}
!89 = !{!90, !90, i64 0}
!90 = !{!"dummy arg data/_QFfooEarr1", !25, i64 0}
```
_______________________________________________
llvm-bugs mailing list
llvm-bugs@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-bugs