-
Notifications
You must be signed in to change notification settings - Fork 14.7k
[LV] Fix branch weights in epilogue min iteration check block #152534
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Conversation
I've changed how we construct the EpilogueVectorizerEpilogueLoop and EpilogueVectorizerMainLoop classes so that we construct the parent class with an additional boolean parameter indicating whether we're vectorising the main or epilogue loop. The InnerLoopAndEpilogueVectorizer class uses this new argument in combination with the EpilogueLoopVectorizationInfo struct to set the right UF and VF values. This then allows EpilogueVectorizerEpilogueLoop to access the correct values of VF and UF for the main loop, which are required when setting branch weights in the minimum iteration check block.
@llvm/pr-subscribers-vectorizers @llvm/pr-subscribers-llvm-transforms Author: David Sherwood (david-arm) ChangesI've changed how we construct the EpilogueVectorizerEpilogueLoop and EpilogueVectorizerMainLoop classes so that we construct the parent class with an additional boolean parameter indicating whether we're vectorising the main or epilogue loop. The InnerLoopAndEpilogueVectorizer class uses this new argument in combination with the EpilogueLoopVectorizationInfo struct to set the right UF and VF values. This then allows EpilogueVectorizerEpilogueLoop to access the correct values of VF and UF for the main loop, which are required when setting branch weights in the minimum iteration check block. Full diff: https://github.com/llvm/llvm-project/pull/152534.diff 3 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index be00fd6a416e5..f76872587e6c7 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -690,9 +690,12 @@ class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
const TargetTransformInfo *TTI, AssumptionCache *AC,
OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
- ProfileSummaryInfo *PSI, GeneratedRTChecks &Checks, VPlan &Plan)
+ ProfileSummaryInfo *PSI, GeneratedRTChecks &Checks, VPlan &Plan,
+ bool ForMainLoop)
: InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
- EPI.MainLoopVF, EPI.MainLoopVF, EPI.MainLoopUF, CM,
+ ForMainLoop ? EPI.MainLoopVF : EPI.EpilogueVF,
+ ForMainLoop ? EPI.MainLoopVF : EPI.EpilogueVF,
+ ForMainLoop ? EPI.MainLoopUF : EPI.EpilogueUF, CM,
BFI, PSI, Checks, Plan),
EPI(EPI) {}
@@ -729,7 +732,8 @@ class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
ProfileSummaryInfo *PSI, GeneratedRTChecks &Check, VPlan &Plan)
: InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
- EPI, CM, BFI, PSI, Check, Plan) {}
+ EPI, CM, BFI, PSI, Check, Plan,
+ /*ForMainLoop=*/true) {}
/// Implements the interface for creating a vectorized skeleton using the
/// *main loop* strategy (ie the first pass of vplan execution).
BasicBlock *createEpilogueVectorizedLoopSkeleton() final;
@@ -756,7 +760,8 @@ class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
ProfileSummaryInfo *PSI, GeneratedRTChecks &Checks, VPlan &Plan)
: InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
- EPI, CM, BFI, PSI, Checks, Plan) {
+ EPI, CM, BFI, PSI, Checks, Plan,
+ /*ForMainLoop=*/false) {
TripCount = EPI.TripCount;
}
/// Implements the interface for creating a vectorized skeleton using the
@@ -7690,9 +7695,7 @@ EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
BranchInst &BI =
*BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) {
- // FIXME: See test Transforms/LoopVectorize/branch-weights.ll. I don't
- // think the MainLoopStep is correct.
- unsigned MainLoopStep = UF * VF.getKnownMinValue();
+ unsigned MainLoopStep = EPI.MainLoopUF * EPI.MainLoopVF.getKnownMinValue();
unsigned EpilogueLoopStep =
EPI.EpilogueUF * EPI.EpilogueVF.getKnownMinValue();
// We assume the remaining `Count` is equally distributed in
@@ -10339,8 +10342,6 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// Second pass vectorizes the epilogue and adjusts the control flow
// edges from the first pass.
- EPI.MainLoopVF = EPI.EpilogueVF;
- EPI.MainLoopUF = EPI.EpilogueUF;
EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
ORE, EPI, &CM, BFI, PSI,
Checks, BestEpiPlan);
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/check-prof-info.ll b/llvm/test/Transforms/LoopVectorize/AArch64/check-prof-info.ll
index 027a88de14153..9987701e44cde 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/check-prof-info.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/check-prof-info.ll
@@ -18,7 +18,7 @@ define void @_Z3foov(i64 %n) {
; CHECK-V1-IC1: [[VECTOR_PH]]:
; CHECK-V1-IC1: br label %[[VECTOR_BODY:.*]]
; CHECK-V1-IC1: [[VECTOR_BODY]]:
-; CHECK-V1-IC1: br i1 [[TMP10:%.*]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !prof [[PROF0]], !llvm.loop [[LOOP1:![0-9]+]]
+; CHECK-V1-IC1: br i1 [[TMP8:%.*]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !prof [[PROF0]], !llvm.loop [[LOOP1:![0-9]+]]
; CHECK-V1-IC1: [[MIDDLE_BLOCK]]:
; CHECK-V1-IC1: br i1 [[CMP_N:%.*]], label %[[FOR_COND_CLEANUP:.*]], label %[[SCALAR_PH]], !prof [[PROF4:![0-9]+]]
; CHECK-V1-IC1: [[SCALAR_PH]]:
@@ -34,13 +34,13 @@ define void @_Z3foov(i64 %n) {
; CHECK-V2-IC1: [[VECTOR_PH]]:
; CHECK-V2-IC1: br label %[[VECTOR_BODY:.*]]
; CHECK-V2-IC1: [[VECTOR_BODY]]:
-; CHECK-V2-IC1: br i1 [[TMP4:%.*]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !prof [[PROF1:![0-9]+]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK-V2-IC1: br i1 [[TMP2:%.*]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !prof [[PROF1:![0-9]+]], !llvm.loop [[LOOP2:![0-9]+]]
; CHECK-V2-IC1: [[MIDDLE_BLOCK]]:
; CHECK-V2-IC1: br i1 [[CMP_N:%.*]], label %[[FOR_COND_CLEANUP:.*]], label %[[SCALAR_PH]], !prof [[PROF5:![0-9]+]]
; CHECK-V2-IC1: [[SCALAR_PH]]:
; CHECK-V2-IC1: br label %[[FOR_BODY:.*]]
; CHECK-V2-IC1: [[FOR_BODY]]:
-; CHECK-V2-IC1: br i1 [[EXITCOND:%.*]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !prof [[PROF5:![0-9]+]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-V2-IC1: br i1 [[EXITCOND:%.*]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !prof [[PROF6:![0-9]+]], !llvm.loop [[LOOP7:![0-9]+]]
; CHECK-V2-IC1: [[FOR_COND_CLEANUP]]:
;
; CHECK-V2-IC4-LABEL: define void @_Z3foov(
@@ -52,7 +52,7 @@ define void @_Z3foov(i64 %n) {
; CHECK-V2-IC4: [[VECTOR_PH]]:
; CHECK-V2-IC4: br label %[[VECTOR_BODY:.*]]
; CHECK-V2-IC4: [[VECTOR_BODY]]:
-; CHECK-V2-IC4: br i1 [[TMP10:%.*]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !prof [[PROF1:![0-9]+]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK-V2-IC4: br i1 [[TMP8:%.*]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !prof [[PROF1:![0-9]+]], !llvm.loop [[LOOP2:![0-9]+]]
; CHECK-V2-IC4: [[MIDDLE_BLOCK]]:
; CHECK-V2-IC4: br i1 [[CMP_N:%.*]], label %[[FOR_COND_CLEANUP:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]], !prof [[PROF5:![0-9]+]]
; CHECK-V2-IC4: [[VEC_EPILOG_ITER_CHECK]]:
@@ -60,7 +60,7 @@ define void @_Z3foov(i64 %n) {
; CHECK-V2-IC4: [[VEC_EPILOG_PH]]:
; CHECK-V2-IC4: br label %[[VEC_EPILOG_VECTOR_BODY:.*]]
; CHECK-V2-IC4: [[VEC_EPILOG_VECTOR_BODY]]:
-; CHECK-V2-IC4: br i1 [[TMP15:%.*]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-V2-IC4: br i1 [[TMP11:%.*]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
; CHECK-V2-IC4: [[VEC_EPILOG_MIDDLE_BLOCK]]:
; CHECK-V2-IC4: br i1 [[CMP_N10:%.*]], label %[[FOR_COND_CLEANUP]], label %[[VEC_EPILOG_SCALAR_PH]], !prof [[PROF8:![0-9]+]]
; CHECK-V2-IC4: [[VEC_EPILOG_SCALAR_PH]]:
@@ -101,8 +101,9 @@ for.cond.cleanup: ; preds = %for.body
; CHECK-V2-IC1: [[LOOP2]] = distinct !{[[LOOP2]], [[META3:![0-9]+]], [[META4:![0-9]+]]}
; CHECK-V2-IC1: [[META3]] = !{!"llvm.loop.isvectorized", i32 1}
; CHECK-V2-IC1: [[META4]] = !{!"llvm.loop.unroll.runtime.disable"}
-; CHECK-V2-IC1: [[PROF5]] = !{!"branch_weights", i32 0, i32 0}
-; CHECK-V2-IC1: [[LOOP6]] = distinct !{[[LOOP6]], [[META4]], [[META3]]}
+; CHECK-V2-IC1: [[PROF5]] = !{!"branch_weights", i32 1, i32 3}
+; CHECK-V2-IC1: [[PROF6]] = !{!"branch_weights", i32 0, i32 0}
+; CHECK-V2-IC1: [[LOOP7]] = distinct !{[[LOOP7]], [[META4]], [[META3]]}
;.
; CHECK-V2-IC4: [[PROF0]] = !{!"branch_weights", i32 1, i32 127}
; CHECK-V2-IC4: [[PROF1]] = !{!"branch_weights", i32 1, i32 63}
@@ -110,7 +111,7 @@ for.cond.cleanup: ; preds = %for.body
; CHECK-V2-IC4: [[META3]] = !{!"llvm.loop.isvectorized", i32 1}
; CHECK-V2-IC4: [[META4]] = !{!"llvm.loop.unroll.runtime.disable"}
; CHECK-V2-IC4: [[PROF5]] = !{!"branch_weights", i32 1, i32 15}
-; CHECK-V2-IC4: [[PROF6]] = !{!"branch_weights", i32 4, i32 0}
+; CHECK-V2-IC4: [[PROF6]] = !{!"branch_weights", i32 4, i32 12}
; CHECK-V2-IC4: [[LOOP7]] = distinct !{[[LOOP7]], [[META3]], [[META4]]}
; CHECK-V2-IC4: [[PROF8]] = !{!"branch_weights", i32 1, i32 3}
; CHECK-V2-IC4: [[PROF9]] = !{!"branch_weights", i32 0, i32 0}
diff --git a/llvm/test/Transforms/LoopVectorize/branch-weights.ll b/llvm/test/Transforms/LoopVectorize/branch-weights.ll
index 6892709f085f7..7ae06953c5544 100644
--- a/llvm/test/Transforms/LoopVectorize/branch-weights.ll
+++ b/llvm/test/Transforms/LoopVectorize/branch-weights.ll
@@ -4,13 +4,6 @@
; RUN: opt < %s -S -passes=loop-vectorize -force-vector-interleave=2 -force-vector-width=4 -enable-epilogue-vectorization \
; RUN: -epilogue-vectorization-force-VF=4 | FileCheck %s --check-prefix=MAINVF4IC2_EPI4
-; FIXME: For MAINVF4IC2_EPI4 the branch weights in the terminator of
-; the VEC_EPILOG_ITER_CHECK block should be [4,4] since we process 8
-; scalar iterations in the main loop, leaving the remaining count to
-; be in the range [0,7]. That gives a 4:4 chance of skipping the
-; vector epilogue. I believe the problem lies in
-; EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck
-; where the main loop VF is set to the same value as the epilogue VF.
define void @f0(i8 %n, i32 %len, ptr %p) !prof !0 {
; MAINVF4IC1_EPI4-LABEL: define void @f0(
; MAINVF4IC1_EPI4-SAME: i8 [[N:%.*]], i32 [[LEN:%.*]], ptr [[P:%.*]]) !prof [[PROF0:![0-9]+]] {
@@ -145,7 +138,7 @@ exit:
; MAINVF4IC2_EPI4: [[META5]] = !{!"llvm.loop.isvectorized", i32 1}
; MAINVF4IC2_EPI4: [[META6]] = !{!"llvm.loop.unroll.runtime.disable"}
; MAINVF4IC2_EPI4: [[PROF7]] = !{!"branch_weights", i32 1, i32 7}
-; MAINVF4IC2_EPI4: [[PROF8]] = !{!"branch_weights", i32 4, i32 0}
+; MAINVF4IC2_EPI4: [[PROF8]] = !{!"branch_weights", i32 4, i32 4}
; MAINVF4IC2_EPI4: [[PROF9]] = !{!"branch_weights", i32 0, i32 0}
; MAINVF4IC2_EPI4: [[LOOP10]] = distinct !{[[LOOP10]], [[META5]], [[META6]]}
; MAINVF4IC2_EPI4: [[PROF11]] = !{!"branch_weights", i32 1, i32 3}
|
@@ -690,9 +690,12 @@ class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer { | |||
const TargetTransformInfo *TTI, AssumptionCache *AC, | |||
OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, | |||
LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, | |||
ProfileSummaryInfo *PSI, GeneratedRTChecks &Checks, VPlan &Plan) | |||
ProfileSummaryInfo *PSI, GeneratedRTChecks &Checks, VPlan &Plan, | |||
bool ForMainLoop) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Nit, if the only two callers of this constructor are EpilogueVectorizerMainLoop
and EpilogueVectorizerEpilogueLoop
below, should we just add arguments for VF/MinProfitableTripCount/UnrollFactor and pass them instead of the bool?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yeah that would probably be better
@@ -690,9 +690,12 @@ class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer { | |||
const TargetTransformInfo *TTI, AssumptionCache *AC, | |||
OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, | |||
LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, | |||
ProfileSummaryInfo *PSI, GeneratedRTChecks &Checks, VPlan &Plan) | |||
ProfileSummaryInfo *PSI, GeneratedRTChecks &Checks, VPlan &Plan, | |||
bool ForMainLoop) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yeah that would probably be better
// FIXME: See test Transforms/LoopVectorize/branch-weights.ll. I don't | ||
// think the MainLoopStep is correct. | ||
unsigned MainLoopStep = UF * VF.getKnownMinValue(); | ||
unsigned MainLoopStep = EPI.MainLoopUF * EPI.MainLoopVF.getKnownMinValue(); | ||
unsigned EpilogueLoopStep = |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
can you either update this to use VF/UF, or perhapos better update EpilogueVectorizerMainLoop::emitIterationCountCheck
to not use VF/UF but EPI.MainLoopVF.
for consistency?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM, thanks
I've changed how we construct the EpilogueVectorizerEpilogueLoop and EpilogueVectorizerMainLoop classes so that we construct the parent class with an additional boolean parameter indicating whether we're vectorising the main or epilogue loop. The InnerLoopAndEpilogueVectorizer class uses this new argument in combination with the EpilogueLoopVectorizationInfo struct to set the right UF and VF values. This then allows EpilogueVectorizerEpilogueLoop to access the correct values of VF and UF for the main loop, which are required when setting branch weights in the minimum iteration check block.