Skip to content

Commit d6ce82b

Browse files
committed
[VPlan] Compute interleave count for VPlan.
Move selectInterleaveCount to LoopVectorizationPlanner and retrieve some information directly from VPlan. Register pressure was already computed for a VPlan, and with this patch we now also check for reductions directly on VPlan, as well as checking how many load and store operations remain in the loop. This should be mostly NFC, but we may compute slightly different interleave counts, except for some edge cases, e.g. where dead loads have been removed. This shouldn't happen in practice, and the patch doesn't cause changes across a large test corpus on AArch64. Computing the interleave count based on VPlan allows for making better decisions in presence of VPlan optimizations, for example when operations on interleave groups are narrowed. Note that there are a few test changes for tests that were still checking the legacy cost-model output when it was computed in selectInterleaveCount.
1 parent e138c95 commit d6ce82b

File tree

6 files changed

+141
-79
lines changed

6 files changed

+141
-79
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -487,6 +487,9 @@ class LoopVectorizationPlanner {
487487
/// all profitable VFs in ProfitableVFs.
488488
VectorizationFactor computeBestVF();
489489

490+
unsigned selectInterleaveCount(VPlan &Plan, ElementCount VF,
491+
InstructionCost LoopCost);
492+
490493
/// Generate the IR code for the vectorized loop captured in VPlan \p BestPlan
491494
/// according to the best selected \p VF and \p UF.
492495
///

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 72 additions & 38 deletions
Original file line number | Diff line number | Diff line change
@@ -955,13 +955,6 @@ class LoopVectorizationCostModel {
955955
/// 64 bit loop indices.
956956
std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
957957

958-
/// \return The desired interleave count.
959-
/// If interleave count has been specified by metadata it will be returned.
960-
/// Otherwise, the interleave count is computed and returned. VF and LoopCost
961-
/// are the selected vectorization factor and the cost of the selected VF.
962-
unsigned selectInterleaveCount(VPlan &Plan, ElementCount VF,
963-
InstructionCost LoopCost);
964-
965958
/// Memory access instruction may be vectorized in more than one way.
966959
/// Form of instruction after vectorization depends on cost.
967960
/// This function takes cost-based decisions for Load/Store instructions
@@ -4606,8 +4599,8 @@ void LoopVectorizationCostModel::collectElementTypesForWidening() {
46064599
}
46074600

46084601
unsigned
4609-
LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF,
4610-
InstructionCost LoopCost) {
4602+
LoopVectorizationPlanner::selectInterleaveCount(VPlan &Plan, ElementCount VF,
4603+
InstructionCost LoopCost) {
46114604
// -- The interleave heuristics --
46124605
// We interleave the loop in order to expose ILP and reduce the loop overhead.
46134606
// There are many micro-architectural considerations that we can't predict
@@ -4622,32 +4615,36 @@ LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF,
46224615
// 3. We don't interleave if we think that we will spill registers to memory
46234616
// due to the increased register pressure.
46244617

4625-
if (!isScalarEpilogueAllowed())
4618+
if (!CM.isScalarEpilogueAllowed())
46264619
return 1;
46274620

4628-
// Do not interleave if EVL is preferred and no User IC is specified.
4629-
if (foldTailWithEVL()) {
4621+
if (any_of(Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis(),
4622+
IsaPred<VPEVLBasedIVPHIRecipe>)) {
46304623
LLVM_DEBUG(dbgs() << "LV: Preference for VP intrinsics indicated. "
46314624
"Unroll factor forced to be 1.\n");
46324625
return 1;
46334626
}
4634-
46354627
// We used the distance for the interleave count.
46364628
if (!Legal->isSafeForAnyVectorWidth())
46374629
return 1;
46384630

46394631
// We don't attempt to perform interleaving for loops with uncountable early
46404632
// exits because the VPInstruction::AnyOf code cannot currently handle
46414633
// multiple parts.
4642-
if (Legal->hasUncountableEarlyExit())
4634+
if (Plan.hasEarlyExit())
46434635
return 1;
46444636

4645-
const bool HasReductions = !Legal->getReductionVars().empty();
4637+
const bool HasReductions =
4638+
any_of(Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis(),
4639+
IsaPred<VPReductionPHIRecipe>);
46464640

46474641
// If we did not calculate the cost for VF (because the user selected the VF)
46484642
// then we calculate the cost of VF here.
46494643
if (LoopCost == 0) {
4650-
LoopCost = expectedCost(VF);
4644+
if (VF.isScalar())
4645+
LoopCost = CM.expectedCost(VF);
4646+
else
4647+
LoopCost = cost(Plan, VF);
46514648
assert(LoopCost.isValid() && "Expected to have chosen a VF with valid cost");
46524649

46534650
// Loop body is free and there is no need for interleaving.
@@ -4656,7 +4653,7 @@ LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF,
46564653
}
46574654

46584655
VPRegisterUsage R =
4659-
calculateRegisterUsageForPlan(Plan, {VF}, TTI, ValuesToIgnore)[0];
4656+
calculateRegisterUsageForPlan(Plan, {VF}, TTI, CM.ValuesToIgnore)[0];
46604657
// We divide by these constants so assume that we have at least one
46614658
// instruction that uses at least one register.
46624659
for (auto &Pair : R.MaxLocalUsers) {
@@ -4717,21 +4714,21 @@ LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF,
47174714
MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
47184715
}
47194716

4720-
unsigned EstimatedVF = getEstimatedRuntimeVF(VF, VScaleForTuning);
4717+
unsigned EstimatedVF = getEstimatedRuntimeVF(VF, CM.getVScaleForTuning());
47214718

47224719
// Try to get the exact trip count, or an estimate based on profiling data or
47234720
// ConstantMax from PSE, failing that.
4724-
if (auto BestKnownTC = getSmallBestKnownTC(PSE, TheLoop)) {
4721+
if (auto BestKnownTC = getSmallBestKnownTC(PSE, OrigLoop)) {
47254722
// At least one iteration must be scalar when this constraint holds. So the
47264723
// maximum available iterations for interleaving is one less.
4727-
unsigned AvailableTC = requiresScalarEpilogue(VF.isVector())
4724+
unsigned AvailableTC = CM.requiresScalarEpilogue(VF.isVector())
47284725
? BestKnownTC->getFixedValue() - 1
47294726
: BestKnownTC->getFixedValue();
47304727

47314728
unsigned InterleaveCountLB = bit_floor(std::max(
47324729
1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount)));
47334730

4734-
if (getSmallConstantTripCount(PSE.getSE(), TheLoop).isNonZero()) {
4731+
if (getSmallConstantTripCount(PSE.getSE(), OrigLoop).isNonZero()) {
47354732
// If the best known trip count is exact, we select between two
47364733
// prospective ICs, where
47374734
//
@@ -4792,7 +4789,7 @@ LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF,
47924789
// vectorized the loop we will have done the runtime check and so interleaving
47934790
// won't require further checks.
47944791
bool ScalarInterleavingRequiresPredication =
4795-
(VF.isScalar() && any_of(TheLoop->blocks(), [this](BasicBlock *BB) {
4792+
(VF.isScalar() && any_of(OrigLoop->blocks(), [this](BasicBlock *BB) {
47964793
return Legal->blockNeedsPredication(BB);
47974794
}));
47984795
bool ScalarInterleavingRequiresRuntimePointerCheck =
@@ -4815,8 +4812,39 @@ LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF,
48154812

48164813
// Interleave until store/load ports (estimated by max interleave count) are
48174814
// saturated.
4818-
unsigned NumStores = Legal->getNumStores();
4819-
unsigned NumLoads = Legal->getNumLoads();
4815+
unsigned NumStores = 0;
4816+
unsigned NumLoads = 0;
4817+
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
4818+
vp_depth_first_deep(Plan.getVectorLoopRegion()->getEntry()))) {
4819+
for (VPRecipeBase &R : *VPBB) {
4820+
if (isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe>(&R)) {
4821+
NumLoads++;
4822+
continue;
4823+
}
4824+
if (isa<VPWidenStoreRecipe, VPWidenStoreEVLRecipe>(&R)) {
4825+
NumStores++;
4826+
continue;
4827+
}
4828+
4829+
if (auto *InterleaveR = dyn_cast<VPInterleaveRecipe>(&R)) {
4830+
if (unsigned StoreOps = InterleaveR->getNumStoreOperands())
4831+
NumStores += StoreOps;
4832+
else
4833+
NumLoads += InterleaveR->getNumDefinedValues();
4834+
continue;
4835+
}
4836+
if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
4837+
NumLoads += isa<LoadInst>(RepR->getUnderlyingInstr());
4838+
NumStores += isa<StoreInst>(RepR->getUnderlyingInstr());
4839+
continue;
4840+
}
4841+
if (isa<VPHistogramRecipe>(&R)) {
4842+
NumLoads++;
4843+
NumStores++;
4844+
continue;
4845+
}
4846+
}
4847+
}
48204848
unsigned StoresIC = IC / (NumStores ? NumStores : 1);
48214849
unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
48224850

@@ -4826,12 +4854,15 @@ LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF,
48264854
// do the final reduction after the loop.
48274855
bool HasSelectCmpReductions =
48284856
HasReductions &&
4829-
any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
4830-
const RecurrenceDescriptor &RdxDesc = Reduction.second;
4831-
RecurKind RK = RdxDesc.getRecurrenceKind();
4832-
return RecurrenceDescriptor::isAnyOfRecurrenceKind(RK) ||
4833-
RecurrenceDescriptor::isFindIVRecurrenceKind(RK);
4834-
});
4857+
any_of(Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis(),
4858+
[](VPRecipeBase &R) {
4859+
auto *RedR = dyn_cast<VPReductionPHIRecipe>(&R);
4860+
4861+
return RedR && (RecurrenceDescriptor::isAnyOfRecurrenceKind(
4862+
RedR->getRecurrenceKind()) ||
4863+
RecurrenceDescriptor::isFindIVRecurrenceKind(
4864+
RedR->getRecurrenceKind()));
4865+
});
48354866
if (HasSelectCmpReductions) {
48364867
LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n");
48374868
return 1;
@@ -4842,12 +4873,14 @@ LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF,
48424873
// we're interleaving is inside another loop. For tree-wise reductions
48434874
// set the limit to 2, and for ordered reductions it's best to disable
48444875
// interleaving entirely.
4845-
if (HasReductions && TheLoop->getLoopDepth() > 1) {
4876+
if (HasReductions && OrigLoop->getLoopDepth() > 1) {
48464877
bool HasOrderedReductions =
4847-
any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
4848-
const RecurrenceDescriptor &RdxDesc = Reduction.second;
4849-
return RdxDesc.isOrdered();
4850-
});
4878+
any_of(Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis(),
4879+
[](VPRecipeBase &R) {
4880+
auto *RedR = dyn_cast<VPReductionPHIRecipe>(&R);
4881+
4882+
return RedR && RedR->isOrdered();
4883+
});
48514884
if (HasOrderedReductions) {
48524885
LLVM_DEBUG(
48534886
dbgs() << "LV: Not interleaving scalar ordered reductions.\n");
@@ -10066,8 +10099,11 @@ bool LoopVectorizePass::processLoop(Loop *L) {
1006610099

1006710100
GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(), CM.CostKind);
1006810101
if (LVP.hasPlanWithVF(VF.Width)) {
10102+
VPCostContext CostCtx(CM.TTI, *CM.TLI, CM.Legal->getWidestInductionType(),
10103+
CM, CM.CostKind);
10104+
1006910105
// Select the interleave count.
10070-
IC = CM.selectInterleaveCount(LVP.getPlanFor(VF.Width), VF.Width, VF.Cost);
10106+
IC = LVP.selectInterleaveCount(LVP.getPlanFor(VF.Width), VF.Width, VF.Cost);
1007110107

1007210108
unsigned SelectedIC = std::max(IC, UserIC);
1007310109
// Optimistically generate runtime checks if they are needed. Drop them if
@@ -10078,8 +10114,6 @@ bool LoopVectorizePass::processLoop(Loop *L) {
1007810114
// Check if it is profitable to vectorize with runtime checks.
1007910115
bool ForceVectorization =
1008010116
Hints.getForce() == LoopVectorizeHints::FK_Enabled;
10081-
VPCostContext CostCtx(CM.TTI, *CM.TLI, CM.Legal->getWidestInductionType(),
10082-
CM, CM.CostKind);
1008310117
if (!ForceVectorization &&
1008410118
!isOutsideLoopWorkProfitable(Checks, VF, L, PSE, CostCtx,
1008510119
LVP.getPlanFor(VF.Width), SEL,

llvm/lib/Transforms/Vectorize/VPlan.h

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -4215,7 +4215,10 @@ class VPlan {
42154215
/// block with multiple predecessors (one for the exit via the latch and one
42164216
/// via the other early exit).
42174217
bool hasEarlyExit() const {
4218-
return ExitBlocks.size() > 1 ||
4218+
return count_if(ExitBlocks,
4219+
[](VPIRBasicBlock *EB) {
4220+
return EB->getNumPredecessors() != 0;
4221+
}) > 1 ||
42194222
(ExitBlocks.size() == 1 && ExitBlocks[0]->getNumPredecessors() > 1);
42204223
}
42214224

llvm/test/Transforms/LoopVectorize/AArch64/predication_costs.ll

Lines changed: 14 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -19,7 +19,7 @@ target triple = "aarch64--linux-gnu"
1919
; (udiv(2) + extractelement(8) + insertelement(4)) / 2 = 7
2020
;
2121
; CHECK: Scalarizing and predicating: %tmp4 = udiv i32 %tmp2, %tmp3
22-
; CHECK: Found an estimated cost of 7 for VF 2 For instruction: %tmp4 = udiv i32 %tmp2, %tmp3
22+
; CHECK: Cost of 7 for VF 2: profitable to scalarize %tmp4 = udiv i32 %tmp2, %tmp3
2323
;
2424
define i32 @predicated_udiv(ptr %a, ptr %b, i1 %c, i64 %n) {
2525
entry:
@@ -60,7 +60,7 @@ for.end:
6060
; (store(4) + extractelement(4)) / 2 = 4
6161
;
6262
; CHECK: Scalarizing and predicating: store i32 %tmp2, ptr %tmp0, align 4
63-
; CHECK: Found an estimated cost of 4 for VF 2 For instruction: store i32 %tmp2, ptr %tmp0, align 4
63+
; CHECK: Cost of 4 for VF 2: profitable to scalarize store i32 %tmp2, ptr %tmp0, align 4
6464
;
6565
define void @predicated_store(ptr %a, i1 %c, i32 %x, i64 %n) {
6666
entry:
@@ -93,8 +93,8 @@ for.end:
9393
; CHECK: Found scalar instruction: %addr = phi ptr [ %a, %entry ], [ %addr.next, %for.inc ]
9494
; CHECK: Found scalar instruction: %addr.next = getelementptr inbounds i32, ptr %addr, i64 1
9595
; CHECK: Scalarizing and predicating: store i32 %tmp2, ptr %addr, align 4
96-
; CHECK: Found an estimated cost of 0 for VF 2 For instruction: %addr = phi ptr [ %a, %entry ], [ %addr.next, %for.inc ]
97-
; CHECK: Found an estimated cost of 4 for VF 2 For instruction: store i32 %tmp2, ptr %addr, align 4
96+
; CHECK: Cost of 0 for VF 2: induction instruction %addr = phi ptr [ %a, %entry ], [ %addr.next, %for.inc ]
97+
; CHECK: Cost of 4 for VF 2: profitable to scalarize store i32 %tmp2, ptr %addr, align 4
9898
;
9999
define void @predicated_store_phi(ptr %a, i1 %c, i32 %x, i64 %n) {
100100
entry:
@@ -135,9 +135,10 @@ for.end:
135135
;
136136
; CHECK: Scalarizing: %tmp3 = add nsw i32 %tmp2, %x
137137
; CHECK: Scalarizing and predicating: %tmp4 = udiv i32 %tmp2, %tmp3
138-
; CHECK: Found an estimated cost of 3 for VF 2 For instruction: %tmp3 = add nsw i32 %tmp2, %x
139-
; CHECK: Found an estimated cost of 5 for VF 2 For instruction: %tmp4 = udiv i32 %tmp2, %tmp3
138+
; CHECK: Cost of 3 for VF 2: profitable to scalarize %tmp3 = add nsw i32 %tmp2, %x
139+
; CHECK: Cost of 5 for VF 2: profitable to scalarize %tmp4 = udiv i32 %tmp2, %tmp3
140140
;
141+
141142
define i32 @predicated_udiv_scalarized_operand(ptr %a, i1 %c, i32 %x, i64 %n) {
142143
entry:
143144
br label %for.body
@@ -180,8 +181,8 @@ for.end:
180181
;
181182
; CHECK: Scalarizing: %tmp2 = add nsw i32 %tmp1, %x
182183
; CHECK: Scalarizing and predicating: store i32 %tmp2, ptr %tmp0, align 4
183-
; CHECK: Found an estimated cost of 3 for VF 2 For instruction: %tmp2 = add nsw i32 %tmp1, %x
184-
; CHECK: Found an estimated cost of 2 for VF 2 For instruction: store i32 %tmp2, ptr %tmp0, align 4
184+
; CHECK: Cost of 3 for VF 2: profitable to scalarize %tmp2 = add nsw i32 %tmp1, %x
185+
; CHECK: Cost of 2 for VF 2: profitable to scalarize store i32 %tmp2, ptr %tmp0, align 4
185186
;
186187
define void @predicated_store_scalarized_operand(ptr %a, i1 %c, i32 %x, i64 %n) {
187188
entry:
@@ -232,11 +233,11 @@ for.end:
232233
; CHECK: Scalarizing and predicating: %tmp4 = udiv i32 %tmp3, %tmp2
233234
; CHECK: Scalarizing: %tmp5 = sub i32 %tmp4, %x
234235
; CHECK: Scalarizing and predicating: store i32 %tmp5, ptr %tmp0, align 4
235-
; CHECK: Found an estimated cost of 1 for VF 2 For instruction: %tmp2 = add i32 %tmp1, %x
236-
; CHECK: Found an estimated cost of 7 for VF 2 For instruction: %tmp3 = sdiv i32 %tmp1, %tmp2
237-
; CHECK: Found an estimated cost of 7 for VF 2 For instruction: %tmp4 = udiv i32 %tmp3, %tmp2
238-
; CHECK: Found an estimated cost of 3 for VF 2 For instruction: %tmp5 = sub i32 %tmp4, %x
239-
; CHECK: Found an estimated cost of 2 for VF 2 For instruction: store i32 %tmp5, ptr %tmp0, align 4
236+
; CHECK: Cost of 7 for VF 2: profitable to scalarize %tmp4 = udiv i32 %tmp3, %tmp2
237+
; CHECK: Cost of 7 for VF 2: profitable to scalarize %tmp3 = sdiv i32 %tmp1, %tmp2
238+
; CHECK: Cost of 2 for VF 2: profitable to scalarize store i32 %tmp5, ptr %tmp0, align 4
239+
; CHECK: Cost of 3 for VF 2: profitable to scalarize %tmp5 = sub i32 %tmp4, %x
240+
; CHECK: Cost of 1 for VF 2: WIDEN ir<%tmp2> = add ir<%tmp1>, ir<%x>
240241
;
241242
define void @predication_multi_context(ptr %a, i1 %c, i32 %x, i64 %n) {
242243
entry:

0 commit comments

Comments (0)