From a2bfddb942f6869ab76581290fe2c321faebd766 Mon Sep 17 00:00:00 2001 From: David Sherwood Date: Thu, 7 Aug 2025 15:39:15 +0000 Subject: [PATCH 1/2] [LV] Fix branch weights in epilogue min iteration check block I've changed how we construct the EpilogueVectorizerEpilogueLoop and EpilogueVectorizerMainLoop classes so that we construct the parent class with an additional boolean parameter indicating whether we're vectorising the main or epilogue loop. The InnerLoopAndEpilogueVectorizer class uses this new argument in combination with the EpilogueLoopVectorizationInfo struct to set the right UF and VF values. This then allows EpilogueVectorizerEpilogueLoop to access the correct values of VF and UF for the main loop, which are required when setting branch weights in the minimum iteration check block. --- .../Transforms/Vectorize/LoopVectorize.cpp | 19 ++++++++++--------- .../LoopVectorize/AArch64/check-prof-info.ll | 17 +++++++++-------- .../LoopVectorize/branch-weights.ll | 9 +-------- 3 files changed, 20 insertions(+), 25 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index be00fd6a416e5..f76872587e6c7 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -690,9 +690,12 @@ class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer { const TargetTransformInfo *TTI, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, - ProfileSummaryInfo *PSI, GeneratedRTChecks &Checks, VPlan &Plan) + ProfileSummaryInfo *PSI, GeneratedRTChecks &Checks, VPlan &Plan, + bool ForMainLoop) : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, - EPI.MainLoopVF, EPI.MainLoopVF, EPI.MainLoopUF, CM, + ForMainLoop ? EPI.MainLoopVF : EPI.EpilogueVF, + ForMainLoop ? EPI.MainLoopVF : EPI.EpilogueVF, + ForMainLoop ? EPI.MainLoopUF : EPI.EpilogueUF, CM, BFI, PSI, Checks, Plan), EPI(EPI) {} @@ -729,7 +732,8 @@ class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer { LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, GeneratedRTChecks &Check, VPlan &Plan) : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, - EPI, CM, BFI, PSI, Check, Plan) {} + EPI, CM, BFI, PSI, Check, Plan, + /*ForMainLoop=*/true) {} /// Implements the interface for creating a vectorized skeleton using the /// *main loop* strategy (ie the first pass of vplan execution). BasicBlock *createEpilogueVectorizedLoopSkeleton() final; @@ -756,7 +760,8 @@ class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer { LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, GeneratedRTChecks &Checks, VPlan &Plan) : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, - EPI, CM, BFI, PSI, Checks, Plan) { + EPI, CM, BFI, PSI, Checks, Plan, + /*ForMainLoop=*/false) { TripCount = EPI.TripCount; } /// Implements the interface for creating a vectorized skeleton using the @@ -7690,9 +7695,7 @@ EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck( BranchInst &BI = *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters); if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) { - // FIXME: See test Transforms/LoopVectorize/branch-weights.ll. I don't - // think the MainLoopStep is correct. - unsigned MainLoopStep = UF * VF.getKnownMinValue(); + unsigned MainLoopStep = EPI.MainLoopUF * EPI.MainLoopVF.getKnownMinValue(); unsigned EpilogueLoopStep = EPI.EpilogueUF * EPI.EpilogueVF.getKnownMinValue(); // We assume the remaining `Count` is equally distributed in @@ -10339,8 +10342,6 @@ bool LoopVectorizePass::processLoop(Loop *L) { // Second pass vectorizes the epilogue and adjusts the control flow // edges from the first pass. - EPI.MainLoopVF = EPI.EpilogueVF; - EPI.MainLoopUF = EPI.EpilogueUF; EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC, ORE, EPI, &CM, BFI, PSI, Checks, BestEpiPlan); diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/check-prof-info.ll b/llvm/test/Transforms/LoopVectorize/AArch64/check-prof-info.ll index 027a88de14153..9987701e44cde 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/check-prof-info.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/check-prof-info.ll @@ -18,7 +18,7 @@ define void @_Z3foov(i64 %n) { ; CHECK-V1-IC1: [[VECTOR_PH]]: ; CHECK-V1-IC1: br label %[[VECTOR_BODY:.*]] ; CHECK-V1-IC1: [[VECTOR_BODY]]: -; CHECK-V1-IC1: br i1 [[TMP10:%.*]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !prof [[PROF0]], !llvm.loop [[LOOP1:![0-9]+]] +; CHECK-V1-IC1: br i1 [[TMP8:%.*]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !prof [[PROF0]], !llvm.loop [[LOOP1:![0-9]+]] ; CHECK-V1-IC1: [[MIDDLE_BLOCK]]: ; CHECK-V1-IC1: br i1 [[CMP_N:%.*]], label %[[FOR_COND_CLEANUP:.*]], label %[[SCALAR_PH]], !prof [[PROF4:![0-9]+]] ; CHECK-V1-IC1: [[SCALAR_PH]]: @@ -34,13 +34,13 @@ define void @_Z3foov(i64 %n) { ; CHECK-V2-IC1: [[VECTOR_PH]]: ; CHECK-V2-IC1: br label %[[VECTOR_BODY:.*]] ; CHECK-V2-IC1: [[VECTOR_BODY]]: -; CHECK-V2-IC1: br i1 [[TMP4:%.*]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !prof [[PROF1:![0-9]+]], !llvm.loop [[LOOP2:![0-9]+]] +; CHECK-V2-IC1: br i1 [[TMP2:%.*]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !prof [[PROF1:![0-9]+]], !llvm.loop [[LOOP2:![0-9]+]] ; CHECK-V2-IC1: [[MIDDLE_BLOCK]]: ; CHECK-V2-IC1: br i1 [[CMP_N:%.*]], label %[[FOR_COND_CLEANUP:.*]], label %[[SCALAR_PH]], !prof [[PROF5:![0-9]+]] ; CHECK-V2-IC1: [[SCALAR_PH]]: ; CHECK-V2-IC1: br label %[[FOR_BODY:.*]] ; CHECK-V2-IC1: [[FOR_BODY]]: -; CHECK-V2-IC1: br i1 [[EXITCOND:%.*]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !prof [[PROF5:![0-9]+]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-V2-IC1: br i1 [[EXITCOND:%.*]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !prof [[PROF6:![0-9]+]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK-V2-IC1: [[FOR_COND_CLEANUP]]: ; ; CHECK-V2-IC4-LABEL: define void @_Z3foov( @@ -52,7 +52,7 @@ define void @_Z3foov(i64 %n) { ; CHECK-V2-IC4: [[VECTOR_PH]]: ; CHECK-V2-IC4: br label %[[VECTOR_BODY:.*]] ; CHECK-V2-IC4: [[VECTOR_BODY]]: -; CHECK-V2-IC4: br i1 [[TMP10:%.*]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !prof [[PROF1:![0-9]+]], !llvm.loop [[LOOP2:![0-9]+]] +; CHECK-V2-IC4: br i1 [[TMP8:%.*]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !prof [[PROF1:![0-9]+]], !llvm.loop [[LOOP2:![0-9]+]] ; CHECK-V2-IC4: [[MIDDLE_BLOCK]]: ; CHECK-V2-IC4: br i1 [[CMP_N:%.*]], label %[[FOR_COND_CLEANUP:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]], !prof [[PROF5:![0-9]+]] ; CHECK-V2-IC4: [[VEC_EPILOG_ITER_CHECK]]: @@ -60,7 +60,7 @@ define void @_Z3foov(i64 %n) { ; CHECK-V2-IC4: [[VEC_EPILOG_PH]]: ; CHECK-V2-IC4: br label %[[VEC_EPILOG_VECTOR_BODY:.*]] ; CHECK-V2-IC4: [[VEC_EPILOG_VECTOR_BODY]]: -; CHECK-V2-IC4: br i1 [[TMP15:%.*]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-V2-IC4: br i1 [[TMP11:%.*]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK-V2-IC4: [[VEC_EPILOG_MIDDLE_BLOCK]]: ; CHECK-V2-IC4: br i1 [[CMP_N10:%.*]], label %[[FOR_COND_CLEANUP]], label %[[VEC_EPILOG_SCALAR_PH]], !prof [[PROF8:![0-9]+]] ; CHECK-V2-IC4: [[VEC_EPILOG_SCALAR_PH]]: @@ -101,8 +101,9 @@ for.cond.cleanup: ; preds = %for.body ; CHECK-V2-IC1: [[LOOP2]] = distinct !{[[LOOP2]], [[META3:![0-9]+]], [[META4:![0-9]+]]} ; CHECK-V2-IC1: [[META3]] = !{!"llvm.loop.isvectorized", i32 1} ; CHECK-V2-IC1: [[META4]] = !{!"llvm.loop.unroll.runtime.disable"} -; CHECK-V2-IC1: [[PROF5]] = !{!"branch_weights", i32 0, i32 0} -; CHECK-V2-IC1: [[LOOP6]] = distinct !{[[LOOP6]], [[META4]], [[META3]]} +; CHECK-V2-IC1: [[PROF5]] = !{!"branch_weights", i32 1, i32 3} +; CHECK-V2-IC1: [[PROF6]] = !{!"branch_weights", i32 0, i32 0} +; CHECK-V2-IC1: [[LOOP7]] = distinct !{[[LOOP7]], [[META4]], [[META3]]} ;. ; CHECK-V2-IC4: [[PROF0]] = !{!"branch_weights", i32 1, i32 127} ; CHECK-V2-IC4: [[PROF1]] = !{!"branch_weights", i32 1, i32 63} @@ -110,7 +111,7 @@ for.cond.cleanup: ; preds = %for.body ; CHECK-V2-IC4: [[META3]] = !{!"llvm.loop.isvectorized", i32 1} ; CHECK-V2-IC4: [[META4]] = !{!"llvm.loop.unroll.runtime.disable"} ; CHECK-V2-IC4: [[PROF5]] = !{!"branch_weights", i32 1, i32 15} -; CHECK-V2-IC4: [[PROF6]] = !{!"branch_weights", i32 4, i32 0} +; CHECK-V2-IC4: [[PROF6]] = !{!"branch_weights", i32 4, i32 12} ; CHECK-V2-IC4: [[LOOP7]] = distinct !{[[LOOP7]], [[META3]], [[META4]]} ; CHECK-V2-IC4: [[PROF8]] = !{!"branch_weights", i32 1, i32 3} ; CHECK-V2-IC4: [[PROF9]] = !{!"branch_weights", i32 0, i32 0} diff --git a/llvm/test/Transforms/LoopVectorize/branch-weights.ll b/llvm/test/Transforms/LoopVectorize/branch-weights.ll index 6892709f085f7..7ae06953c5544 100644 --- a/llvm/test/Transforms/LoopVectorize/branch-weights.ll +++ b/llvm/test/Transforms/LoopVectorize/branch-weights.ll @@ -4,13 +4,6 @@ ; RUN: opt < %s -S -passes=loop-vectorize -force-vector-interleave=2 -force-vector-width=4 -enable-epilogue-vectorization \ ; RUN: -epilogue-vectorization-force-VF=4 | FileCheck %s --check-prefix=MAINVF4IC2_EPI4 -; FIXME: For MAINVF4IC2_EPI4 the branch weights in the terminator of -; the VEC_EPILOG_ITER_CHECK block should be [4,4] since we process 8 -; scalar iterations in the main loop, leaving the remaining count to -; be in the range [0,7]. That gives a 4:4 chance of skipping the -; vector epilogue. I believe the problem lies in -; EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck -; where the main loop VF is set to the same value as the epilogue VF. define void @f0(i8 %n, i32 %len, ptr %p) !prof !0 { ; MAINVF4IC1_EPI4-LABEL: define void @f0( ; MAINVF4IC1_EPI4-SAME: i8 [[N:%.*]], i32 [[LEN:%.*]], ptr [[P:%.*]]) !prof [[PROF0:![0-9]+]] { @@ -145,7 +138,7 @@ exit: ; MAINVF4IC2_EPI4: [[META5]] = !{!"llvm.loop.isvectorized", i32 1} ; MAINVF4IC2_EPI4: [[META6]] = !{!"llvm.loop.unroll.runtime.disable"} ; MAINVF4IC2_EPI4: [[PROF7]] = !{!"branch_weights", i32 1, i32 7} -; MAINVF4IC2_EPI4: [[PROF8]] = !{!"branch_weights", i32 4, i32 0} +; MAINVF4IC2_EPI4: [[PROF8]] = !{!"branch_weights", i32 4, i32 4} ; MAINVF4IC2_EPI4: [[PROF9]] = !{!"branch_weights", i32 0, i32 0} ; MAINVF4IC2_EPI4: [[LOOP10]] = distinct !{[[LOOP10]], [[META5]], [[META6]]} ; MAINVF4IC2_EPI4: [[PROF11]] = !{!"branch_weights", i32 1, i32 3} From ea9d2f98199e719c45e212fd56d1474bedb5c8bc Mon Sep 17 00:00:00 2001 From: David Sherwood Date: Fri, 8 Aug 2025 13:04:46 +0000 Subject: [PATCH 2/2] Address review comments --- .../Transforms/Vectorize/LoopVectorize.cpp | 28 +++++++++---------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index f76872587e6c7..4c6a96cbb2b01 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -691,12 +691,11 @@ class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer { OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, GeneratedRTChecks &Checks, VPlan &Plan, - bool ForMainLoop) - : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, - ForMainLoop ? EPI.MainLoopVF : EPI.EpilogueVF, - ForMainLoop ? EPI.MainLoopVF : EPI.EpilogueVF, - ForMainLoop ? EPI.MainLoopUF : EPI.EpilogueUF, CM, - BFI, PSI, Checks, Plan), + ElementCount VecWidth, ElementCount MinProfitableTripCount, + unsigned UnrollFactor) + : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, VecWidth, + MinProfitableTripCount, UnrollFactor, CM, BFI, PSI, + Checks, Plan), EPI(EPI) {} // Override this function to handle the more complex control flow around the @@ -731,9 +730,9 @@ class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer { OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, GeneratedRTChecks &Check, VPlan &Plan) - : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, - EPI, CM, BFI, PSI, Check, Plan, - /*ForMainLoop=*/true) {} + : InnerLoopAndEpilogueVectorizer( + OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, EPI, CM, BFI, PSI, Check, + Plan, EPI.MainLoopVF, EPI.MainLoopVF, EPI.MainLoopUF) {} /// Implements the interface for creating a vectorized skeleton using the /// *main loop* strategy (ie the first pass of vplan execution). BasicBlock *createEpilogueVectorizedLoopSkeleton() final; @@ -759,9 +758,9 @@ class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer { OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, GeneratedRTChecks &Checks, VPlan &Plan) - : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, - EPI, CM, BFI, PSI, Checks, Plan, - /*ForMainLoop=*/false) { + : InnerLoopAndEpilogueVectorizer( + OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, EPI, CM, BFI, PSI, Checks, + Plan, EPI.EpilogueVF, EPI.EpilogueVF, EPI.EpilogueUF) { TripCount = EPI.TripCount; } /// Implements the interface for creating a vectorized skeleton using the @@ -7562,8 +7561,9 @@ EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass, assert(Bypass && "Expected valid bypass basic block."); Value *Count = getTripCount(); MinProfitableTripCount = ElementCount::getFixed(0); - Value *CheckMinIters = createIterationCountCheck( - ForEpilogue ? EPI.EpilogueVF : VF, ForEpilogue ? EPI.EpilogueUF : UF); + Value *CheckMinIters = + createIterationCountCheck(ForEpilogue ? EPI.EpilogueVF : EPI.MainLoopVF, + ForEpilogue ? EPI.EpilogueUF : EPI.MainLoopUF); BasicBlock *const TCCheckBlock = LoopVectorPreHeader; if (!ForEpilogue)