@@ -955,13 +955,6 @@ class LoopVectorizationCostModel {
955
955
/// 64 bit loop indices.
956
956
std::pair<unsigned , unsigned > getSmallestAndWidestTypes ();
957
957
958
- /// \return The desired interleave count.
959
- /// If interleave count has been specified by metadata it will be returned.
960
- /// Otherwise, the interleave count is computed and returned. VF and LoopCost
961
- /// are the selected vectorization factor and the cost of the selected VF.
962
- unsigned selectInterleaveCount (VPlan &Plan, ElementCount VF,
963
- InstructionCost LoopCost);
964
-
965
958
/// Memory access instruction may be vectorized in more than one way.
966
959
/// Form of instruction after vectorization depends on cost.
967
960
/// This function takes cost-based decisions for Load/Store instructions
@@ -4611,8 +4604,8 @@ void LoopVectorizationCostModel::collectElementTypesForWidening() {
4611
4604
}
4612
4605
4613
4606
unsigned
4614
- LoopVectorizationCostModel ::selectInterleaveCount (VPlan &Plan, ElementCount VF,
4615
- InstructionCost LoopCost) {
4607
+ LoopVectorizationPlanner ::selectInterleaveCount (VPlan &Plan, ElementCount VF,
4608
+ InstructionCost LoopCost) {
4616
4609
// -- The interleave heuristics --
4617
4610
// We interleave the loop in order to expose ILP and reduce the loop overhead.
4618
4611
// There are many micro-architectural considerations that we can't predict
@@ -4627,11 +4620,11 @@ LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF,
4627
4620
// 3. We don't interleave if we think that we will spill registers to memory
4628
4621
// due to the increased register pressure.
4629
4622
4630
- if (!isScalarEpilogueAllowed ())
4623
+ if (!CM. isScalarEpilogueAllowed ())
4631
4624
return 1 ;
4632
4625
4633
- // Do not interleave if EVL is preferred and no User IC is specified.
4634
- if ( foldTailWithEVL ( )) {
4626
+ if ( any_of (Plan. getVectorLoopRegion ()-> getEntryBasicBlock ()-> phis (),
4627
+ IsaPred<VPEVLBasedIVPHIRecipe> )) {
4635
4628
LLVM_DEBUG (dbgs () << " LV: Preference for VP intrinsics indicated. "
4636
4629
" Unroll factor forced to be 1.\n " );
4637
4630
return 1 ;
@@ -4644,15 +4637,20 @@ LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF,
4644
4637
// We don't attempt to perform interleaving for loops with uncountable early
4645
4638
// exits because the VPInstruction::AnyOf code cannot currently handle
4646
4639
// multiple parts.
4647
- if (Legal-> hasUncountableEarlyExit ())
4640
+ if (Plan. hasEarlyExit ())
4648
4641
return 1 ;
4649
4642
4650
- const bool HasReductions = !Legal->getReductionVars ().empty ();
4643
+ const bool HasReductions =
4644
+ any_of (Plan.getVectorLoopRegion ()->getEntryBasicBlock ()->phis (),
4645
+ IsaPred<VPReductionPHIRecipe>);
4651
4646
4652
4647
// If we did not calculate the cost for VF (because the user selected the VF)
4653
4648
// then we calculate the cost of VF here.
4654
4649
if (LoopCost == 0 ) {
4655
- LoopCost = expectedCost (VF);
4650
+ if (VF.isScalar ())
4651
+ LoopCost = CM.expectedCost (VF);
4652
+ else
4653
+ LoopCost = cost (Plan, VF);
4656
4654
assert (LoopCost.isValid () && " Expected to have chosen a VF with valid cost" );
4657
4655
4658
4656
// Loop body is free and there is no need for interleaving.
@@ -4661,7 +4659,7 @@ LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF,
4661
4659
}
4662
4660
4663
4661
VPRegisterUsage R =
4664
- calculateRegisterUsageForPlan (Plan, {VF}, TTI, ValuesToIgnore)[0 ];
4662
+ calculateRegisterUsageForPlan (Plan, {VF}, TTI, CM. ValuesToIgnore )[0 ];
4665
4663
// We divide by these constants so assume that we have at least one
4666
4664
// instruction that uses at least one register.
4667
4665
for (auto &Pair : R.MaxLocalUsers ) {
@@ -4722,21 +4720,21 @@ LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF,
4722
4720
MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
4723
4721
}
4724
4722
4725
- unsigned EstimatedVF = getEstimatedRuntimeVF (VF, VScaleForTuning );
4723
+ unsigned EstimatedVF = getEstimatedRuntimeVF (VF, CM. getVScaleForTuning () );
4726
4724
4727
4725
// Try to get the exact trip count, or an estimate based on profiling data or
4728
4726
// ConstantMax from PSE, failing that.
4729
- if (auto BestKnownTC = getSmallBestKnownTC (PSE, TheLoop )) {
4727
+ if (auto BestKnownTC = getSmallBestKnownTC (PSE, OrigLoop )) {
4730
4728
// At least one iteration must be scalar when this constraint holds. So the
4731
4729
// maximum available iterations for interleaving is one less.
4732
- unsigned AvailableTC = requiresScalarEpilogue (VF.isVector ())
4730
+ unsigned AvailableTC = CM. requiresScalarEpilogue (VF.isVector ())
4733
4731
? BestKnownTC->getFixedValue () - 1
4734
4732
: BestKnownTC->getFixedValue ();
4735
4733
4736
4734
unsigned InterleaveCountLB = bit_floor (std::max (
4737
4735
1u , std::min (AvailableTC / (EstimatedVF * 2 ), MaxInterleaveCount)));
4738
4736
4739
- if (getSmallConstantTripCount (PSE.getSE (), TheLoop ).isNonZero ()) {
4737
+ if (getSmallConstantTripCount (PSE.getSE (), OrigLoop ).isNonZero ()) {
4740
4738
// If the best known trip count is exact, we select between two
4741
4739
// prospective ICs, where
4742
4740
//
@@ -4797,7 +4795,7 @@ LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF,
4797
4795
// vectorized the loop we will have done the runtime check and so interleaving
4798
4796
// won't require further checks.
4799
4797
bool ScalarInterleavingRequiresPredication =
4800
- (VF.isScalar () && any_of (TheLoop ->blocks (), [this ](BasicBlock *BB) {
4798
+ (VF.isScalar () && any_of (OrigLoop ->blocks (), [this ](BasicBlock *BB) {
4801
4799
return Legal->blockNeedsPredication (BB);
4802
4800
}));
4803
4801
bool ScalarInterleavingRequiresRuntimePointerCheck =
@@ -4820,8 +4818,39 @@ LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF,
4820
4818
4821
4819
// Interleave until store/load ports (estimated by max interleave count) are
4822
4820
// saturated.
4823
- unsigned NumStores = Legal->getNumStores ();
4824
- unsigned NumLoads = Legal->getNumLoads ();
4821
+ unsigned NumStores = 0 ;
4822
+ unsigned NumLoads = 0 ;
4823
+ for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
4824
+ vp_depth_first_deep (Plan.getVectorLoopRegion ()->getEntry ()))) {
4825
+ for (VPRecipeBase &R : *VPBB) {
4826
+ if (isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe>(&R)) {
4827
+ NumLoads++;
4828
+ continue ;
4829
+ }
4830
+ if (isa<VPWidenStoreRecipe, VPWidenStoreEVLRecipe>(&R)) {
4831
+ NumStores++;
4832
+ continue ;
4833
+ }
4834
+
4835
+ if (auto *InterleaveR = dyn_cast<VPInterleaveRecipe>(&R)) {
4836
+ if (unsigned StoreOps = InterleaveR->getNumStoreOperands ())
4837
+ NumStores += StoreOps;
4838
+ else
4839
+ NumLoads += InterleaveR->getNumDefinedValues ();
4840
+ continue ;
4841
+ }
4842
+ if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
4843
+ NumLoads += isa<LoadInst>(RepR->getUnderlyingInstr ());
4844
+ NumStores += isa<StoreInst>(RepR->getUnderlyingInstr ());
4845
+ continue ;
4846
+ }
4847
+ if (isa<VPHistogramRecipe>(&R)) {
4848
+ NumLoads++;
4849
+ NumStores++;
4850
+ continue ;
4851
+ }
4852
+ }
4853
+ }
4825
4854
unsigned StoresIC = IC / (NumStores ? NumStores : 1 );
4826
4855
unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1 );
4827
4856
@@ -4831,12 +4860,15 @@ LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF,
4831
4860
// do the final reduction after the loop.
4832
4861
bool HasSelectCmpReductions =
4833
4862
HasReductions &&
4834
- any_of (Legal->getReductionVars (), [&](auto &Reduction) -> bool {
4835
- const RecurrenceDescriptor &RdxDesc = Reduction.second ;
4836
- RecurKind RK = RdxDesc.getRecurrenceKind ();
4837
- return RecurrenceDescriptor::isAnyOfRecurrenceKind (RK) ||
4838
- RecurrenceDescriptor::isFindIVRecurrenceKind (RK);
4839
- });
4863
+ any_of (Plan.getVectorLoopRegion ()->getEntryBasicBlock ()->phis (),
4864
+ [](VPRecipeBase &R) {
4865
+ auto *RedR = dyn_cast<VPReductionPHIRecipe>(&R);
4866
+
4867
+ return RedR && (RecurrenceDescriptor::isAnyOfRecurrenceKind (
4868
+ RedR->getRecurrenceKind ()) ||
4869
+ RecurrenceDescriptor::isFindIVRecurrenceKind (
4870
+ RedR->getRecurrenceKind ()));
4871
+ });
4840
4872
if (HasSelectCmpReductions) {
4841
4873
LLVM_DEBUG (dbgs () << " LV: Not interleaving select-cmp reductions.\n " );
4842
4874
return 1 ;
@@ -4847,12 +4879,14 @@ LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF,
4847
4879
// we're interleaving is inside another loop. For tree-wise reductions
4848
4880
// set the limit to 2, and for ordered reductions it's best to disable
4849
4881
// interleaving entirely.
4850
- if (HasReductions && TheLoop ->getLoopDepth () > 1 ) {
4882
+ if (HasReductions && OrigLoop ->getLoopDepth () > 1 ) {
4851
4883
bool HasOrderedReductions =
4852
- any_of (Legal->getReductionVars (), [&](auto &Reduction) -> bool {
4853
- const RecurrenceDescriptor &RdxDesc = Reduction.second ;
4854
- return RdxDesc.isOrdered ();
4855
- });
4884
+ any_of (Plan.getVectorLoopRegion ()->getEntryBasicBlock ()->phis (),
4885
+ [](VPRecipeBase &R) {
4886
+ auto *RedR = dyn_cast<VPReductionPHIRecipe>(&R);
4887
+
4888
+ return RedR && RedR->isOrdered ();
4889
+ });
4856
4890
if (HasOrderedReductions) {
4857
4891
LLVM_DEBUG (
4858
4892
dbgs () << " LV: Not interleaving scalar ordered reductions.\n " );
@@ -10071,8 +10105,11 @@ bool LoopVectorizePass::processLoop(Loop *L) {
10071
10105
10072
10106
GeneratedRTChecks Checks (PSE, DT, LI, TTI, F->getDataLayout (), CM.CostKind );
10073
10107
if (LVP.hasPlanWithVF (VF.Width )) {
10108
+ VPCostContext CostCtx (CM.TTI , *CM.TLI , CM.Legal ->getWidestInductionType (),
10109
+ CM, CM.CostKind );
10110
+
10074
10111
// Select the interleave count.
10075
- IC = CM .selectInterleaveCount (LVP.getPlanFor (VF.Width ), VF.Width , VF.Cost );
10112
+ IC = LVP .selectInterleaveCount (LVP.getPlanFor (VF.Width ), VF.Width , VF.Cost );
10076
10113
10077
10114
unsigned SelectedIC = std::max (IC, UserIC);
10078
10115
// Optimistically generate runtime checks if they are needed. Drop them if
@@ -10083,8 +10120,6 @@ bool LoopVectorizePass::processLoop(Loop *L) {
10083
10120
// Check if it is profitable to vectorize with runtime checks.
10084
10121
bool ForceVectorization =
10085
10122
Hints.getForce () == LoopVectorizeHints::FK_Enabled;
10086
- VPCostContext CostCtx (CM.TTI , *CM.TLI , CM.Legal ->getWidestInductionType (),
10087
- CM, CM.CostKind );
10088
10123
if (!ForceVectorization &&
10089
10124
!isOutsideLoopWorkProfitable (Checks, VF, L, PSE, CostCtx,
10090
10125
LVP.getPlanFor (VF.Width ), SEL,
0 commit comments