@@ -955,13 +955,6 @@ class LoopVectorizationCostModel {
955
955
// / 64 bit loop indices.
956
956
std::pair<unsigned , unsigned > getSmallestAndWidestTypes ();
957
957
958
- // / \return The desired interleave count.
959
- // / If interleave count has been specified by metadata it will be returned.
960
- // / Otherwise, the interleave count is computed and returned. VF and LoopCost
961
- // / are the selected vectorization factor and the cost of the selected VF.
962
- unsigned selectInterleaveCount (VPlan &Plan, ElementCount VF,
963
- InstructionCost LoopCost);
964
-
965
958
// / Memory access instruction may be vectorized in more than one way.
966
959
// / Form of instruction after vectorization depends on cost.
967
960
// / This function takes cost-based decisions for Load/Store instructions
@@ -4606,8 +4599,8 @@ void LoopVectorizationCostModel::collectElementTypesForWidening() {
4606
4599
}
4607
4600
4608
4601
unsigned
4609
- LoopVectorizationCostModel ::selectInterleaveCount (VPlan &Plan, ElementCount VF,
4610
- InstructionCost LoopCost) {
4602
+ LoopVectorizationPlanner ::selectInterleaveCount (VPlan &Plan, ElementCount VF,
4603
+ InstructionCost LoopCost) {
4611
4604
// -- The interleave heuristics --
4612
4605
// We interleave the loop in order to expose ILP and reduce the loop overhead.
4613
4606
// There are many micro-architectural considerations that we can't predict
@@ -4622,32 +4615,36 @@ LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF,
4622
4615
// 3. We don't interleave if we think that we will spill registers to memory
4623
4616
// due to the increased register pressure.
4624
4617
4625
- if (!isScalarEpilogueAllowed ())
4618
+ if (!CM. isScalarEpilogueAllowed ())
4626
4619
return 1 ;
4627
4620
4628
- // Do not interleave if EVL is preferred and no User IC is specified.
4629
- if ( foldTailWithEVL ( )) {
4621
+ if ( any_of (Plan. getVectorLoopRegion ()-> getEntryBasicBlock ()-> phis (),
4622
+ IsaPred<VPEVLBasedIVPHIRecipe> )) {
4630
4623
LLVM_DEBUG (dbgs () << " LV: Preference for VP intrinsics indicated. "
4631
4624
" Unroll factor forced to be 1.\n " );
4632
4625
return 1 ;
4633
4626
}
4634
-
4635
4627
// The max safe dependence distance already limits the VF; don't interleave.
4636
4628
if (!Legal->isSafeForAnyVectorWidth ())
4637
4629
return 1 ;
4638
4630
4639
4631
// We don't attempt to perform interleaving for loops with uncountable early
4640
4632
// exits because the VPInstruction::AnyOf code cannot currently handle
4641
4633
// multiple parts.
4642
- if (Legal-> hasUncountableEarlyExit ())
4634
+ if (Plan. hasEarlyExit ())
4643
4635
return 1 ;
4644
4636
4645
- const bool HasReductions = !Legal->getReductionVars ().empty ();
4637
+ const bool HasReductions =
4638
+ any_of (Plan.getVectorLoopRegion ()->getEntryBasicBlock ()->phis (),
4639
+ IsaPred<VPReductionPHIRecipe>);
4646
4640
4647
4641
// If we did not calculate the cost for VF (because the user selected the VF)
4648
4642
// then we calculate the cost of VF here.
4649
4643
if (LoopCost == 0 ) {
4650
- LoopCost = expectedCost (VF);
4644
+ if (VF.isScalar ())
4645
+ LoopCost = CM.expectedCost (VF);
4646
+ else
4647
+ LoopCost = cost (Plan, VF);
4651
4648
assert (LoopCost.isValid () && " Expected to have chosen a VF with valid cost" );
4652
4649
4653
4650
// Loop body is free and there is no need for interleaving.
@@ -4656,7 +4653,7 @@ LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF,
4656
4653
}
4657
4654
4658
4655
VPRegisterUsage R =
4659
- calculateRegisterUsageForPlan (Plan, {VF}, TTI, ValuesToIgnore)[0 ];
4656
+ calculateRegisterUsageForPlan (Plan, {VF}, TTI, CM. ValuesToIgnore )[0 ];
4660
4657
// We divide by these constants so assume that we have at least one
4661
4658
// instruction that uses at least one register.
4662
4659
for (auto &Pair : R.MaxLocalUsers ) {
@@ -4717,21 +4714,21 @@ LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF,
4717
4714
MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
4718
4715
}
4719
4716
4720
- unsigned EstimatedVF = getEstimatedRuntimeVF (VF, VScaleForTuning );
4717
+ unsigned EstimatedVF = getEstimatedRuntimeVF (VF, CM. getVScaleForTuning () );
4721
4718
4722
4719
// Try to get the exact trip count, or an estimate based on profiling data or
4723
4720
// ConstantMax from PSE, failing that.
4724
- if (auto BestKnownTC = getSmallBestKnownTC (PSE, TheLoop )) {
4721
+ if (auto BestKnownTC = getSmallBestKnownTC (PSE, OrigLoop )) {
4725
4722
// At least one iteration must be scalar when this constraint holds. So the
4726
4723
// maximum available iterations for interleaving is one less.
4727
- unsigned AvailableTC = requiresScalarEpilogue (VF.isVector ())
4724
+ unsigned AvailableTC = CM. requiresScalarEpilogue (VF.isVector ())
4728
4725
? BestKnownTC->getFixedValue () - 1
4729
4726
: BestKnownTC->getFixedValue ();
4730
4727
4731
4728
unsigned InterleaveCountLB = bit_floor (std::max (
4732
4729
1u , std::min (AvailableTC / (EstimatedVF * 2 ), MaxInterleaveCount)));
4733
4730
4734
- if (getSmallConstantTripCount (PSE.getSE (), TheLoop ).isNonZero ()) {
4731
+ if (getSmallConstantTripCount (PSE.getSE (), OrigLoop ).isNonZero ()) {
4735
4732
// If the best known trip count is exact, we select between two
4736
4733
// prospective ICs, where
4737
4734
//
@@ -4792,7 +4789,7 @@ LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF,
4792
4789
// vectorized the loop we will have done the runtime check and so interleaving
4793
4790
// won't require further checks.
4794
4791
bool ScalarInterleavingRequiresPredication =
4795
- (VF.isScalar () && any_of (TheLoop ->blocks (), [this ](BasicBlock *BB) {
4792
+ (VF.isScalar () && any_of (OrigLoop ->blocks (), [this ](BasicBlock *BB) {
4796
4793
return Legal->blockNeedsPredication (BB);
4797
4794
}));
4798
4795
bool ScalarInterleavingRequiresRuntimePointerCheck =
@@ -4815,8 +4812,39 @@ LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF,
4815
4812
4816
4813
// Interleave until store/load ports (estimated by max interleave count) are
4817
4814
// saturated.
4818
- unsigned NumStores = Legal->getNumStores ();
4819
- unsigned NumLoads = Legal->getNumLoads ();
4815
+ unsigned NumStores = 0 ;
4816
+ unsigned NumLoads = 0 ;
4817
+ for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
4818
+ vp_depth_first_deep (Plan.getVectorLoopRegion ()->getEntry ()))) {
4819
+ for (VPRecipeBase &R : *VPBB) {
4820
+ if (isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe>(&R)) {
4821
+ NumLoads++;
4822
+ continue ;
4823
+ }
4824
+ if (isa<VPWidenStoreRecipe, VPWidenStoreEVLRecipe>(&R)) {
4825
+ NumStores++;
4826
+ continue ;
4827
+ }
4828
+
4829
+ if (auto *InterleaveR = dyn_cast<VPInterleaveRecipe>(&R)) {
4830
+ if (unsigned StoreOps = InterleaveR->getNumStoreOperands ())
4831
+ NumStores += StoreOps;
4832
+ else
4833
+ NumLoads += InterleaveR->getNumDefinedValues ();
4834
+ continue ;
4835
+ }
4836
+ if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
4837
+ NumLoads += isa<LoadInst>(RepR->getUnderlyingInstr ());
4838
+ NumStores += isa<StoreInst>(RepR->getUnderlyingInstr ());
4839
+ continue ;
4840
+ }
4841
+ if (isa<VPHistogramRecipe>(&R)) {
4842
+ NumLoads++;
4843
+ NumStores++;
4844
+ continue ;
4845
+ }
4846
+ }
4847
+ }
4820
4848
unsigned StoresIC = IC / (NumStores ? NumStores : 1 );
4821
4849
unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1 );
4822
4850
@@ -4826,12 +4854,15 @@ LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF,
4826
4854
// do the final reduction after the loop.
4827
4855
bool HasSelectCmpReductions =
4828
4856
HasReductions &&
4829
- any_of (Legal->getReductionVars (), [&](auto &Reduction) -> bool {
4830
- const RecurrenceDescriptor &RdxDesc = Reduction.second ;
4831
- RecurKind RK = RdxDesc.getRecurrenceKind ();
4832
- return RecurrenceDescriptor::isAnyOfRecurrenceKind (RK) ||
4833
- RecurrenceDescriptor::isFindIVRecurrenceKind (RK);
4834
- });
4857
+ any_of (Plan.getVectorLoopRegion ()->getEntryBasicBlock ()->phis (),
4858
+ [](VPRecipeBase &R) {
4859
+ auto *RedR = dyn_cast<VPReductionPHIRecipe>(&R);
4860
+
4861
+ return RedR && (RecurrenceDescriptor::isAnyOfRecurrenceKind (
4862
+ RedR->getRecurrenceKind ()) ||
4863
+ RecurrenceDescriptor::isFindIVRecurrenceKind (
4864
+ RedR->getRecurrenceKind ()));
4865
+ });
4835
4866
if (HasSelectCmpReductions) {
4836
4867
LLVM_DEBUG (dbgs () << " LV: Not interleaving select-cmp reductions.\n " );
4837
4868
return 1 ;
@@ -4842,12 +4873,14 @@ LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF,
4842
4873
// we're interleaving is inside another loop. For tree-wise reductions
4843
4874
// set the limit to 2, and for ordered reductions it's best to disable
4844
4875
// interleaving entirely.
4845
- if (HasReductions && TheLoop ->getLoopDepth () > 1 ) {
4876
+ if (HasReductions && OrigLoop ->getLoopDepth () > 1 ) {
4846
4877
bool HasOrderedReductions =
4847
- any_of (Legal->getReductionVars (), [&](auto &Reduction) -> bool {
4848
- const RecurrenceDescriptor &RdxDesc = Reduction.second ;
4849
- return RdxDesc.isOrdered ();
4850
- });
4878
+ any_of (Plan.getVectorLoopRegion ()->getEntryBasicBlock ()->phis (),
4879
+ [](VPRecipeBase &R) {
4880
+ auto *RedR = dyn_cast<VPReductionPHIRecipe>(&R);
4881
+
4882
+ return RedR && RedR->isOrdered ();
4883
+ });
4851
4884
if (HasOrderedReductions) {
4852
4885
LLVM_DEBUG (
4853
4886
dbgs () << " LV: Not interleaving scalar ordered reductions.\n " );
@@ -10066,8 +10099,11 @@ bool LoopVectorizePass::processLoop(Loop *L) {
10066
10099
10067
10100
GeneratedRTChecks Checks (PSE, DT, LI, TTI, F->getDataLayout (), CM.CostKind );
10068
10101
if (LVP.hasPlanWithVF (VF.Width )) {
10102
+ VPCostContext CostCtx (CM.TTI , *CM.TLI , CM.Legal ->getWidestInductionType (),
10103
+ CM, CM.CostKind );
10104
+
10069
10105
// Select the interleave count.
10070
- IC = CM .selectInterleaveCount (LVP.getPlanFor (VF.Width ), VF.Width , VF.Cost );
10106
+ IC = LVP .selectInterleaveCount (LVP.getPlanFor (VF.Width ), VF.Width , VF.Cost );
10071
10107
10072
10108
unsigned SelectedIC = std::max (IC, UserIC);
10073
10109
// Optimistically generate runtime checks if they are needed. Drop them if
@@ -10078,8 +10114,6 @@ bool LoopVectorizePass::processLoop(Loop *L) {
10078
10114
// Check if it is profitable to vectorize with runtime checks.
10079
10115
bool ForceVectorization =
10080
10116
Hints.getForce () == LoopVectorizeHints::FK_Enabled;
10081
- VPCostContext CostCtx (CM.TTI , *CM.TLI , CM.Legal ->getWidestInductionType (),
10082
- CM, CM.CostKind );
10083
10117
if (!ForceVectorization &&
10084
10118
!isOutsideLoopWorkProfitable (Checks, VF, L, PSE, CostCtx,
10085
10119
LVP.getPlanFor (VF.Width ), SEL,
0 commit comments