@@ -955,13 +955,6 @@ class LoopVectorizationCostModel {
955
955
/// 64 bit loop indices.
956
956
std::pair<unsigned , unsigned > getSmallestAndWidestTypes ();
957
957
958
- // / \return The desired interleave count.
959
- // / If interleave count has been specified by metadata it will be returned.
960
- // / Otherwise, the interleave count is computed and returned. VF and LoopCost
961
- // / are the selected vectorization factor and the cost of the selected VF.
962
- unsigned selectInterleaveCount (VPlan &Plan, ElementCount VF,
963
- InstructionCost LoopCost);
964
-
965
958
/// Memory access instruction may be vectorized in more than one way.
966
959
/// Form of instruction after vectorization depends on cost.
967
960
/// This function takes cost-based decisions for Load/Store instructions
@@ -4611,8 +4604,8 @@ void LoopVectorizationCostModel::collectElementTypesForWidening() {
4611
4604
}
4612
4605
4613
4606
unsigned
4614
- LoopVectorizationCostModel ::selectInterleaveCount (VPlan &Plan, ElementCount VF,
4615
- InstructionCost LoopCost) {
4607
+ LoopVectorizationPlanner ::selectInterleaveCount (VPlan &Plan, ElementCount VF,
4608
+ InstructionCost LoopCost) {
4616
4609
// -- The interleave heuristics --
4617
4610
// We interleave the loop in order to expose ILP and reduce the loop overhead.
4618
4611
// There are many micro-architectural considerations that we can't predict
@@ -4627,32 +4620,36 @@ LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF,
4627
4620
// 3. We don't interleave if we think that we will spill registers to memory
4628
4621
// due to the increased register pressure.
4629
4622
4630
- if (!isScalarEpilogueAllowed ())
4623
+ if (!CM. isScalarEpilogueAllowed ())
4631
4624
return 1 ;
4632
4625
4633
- // Do not interleave if EVL is preferred and no User IC is specified.
4634
- if ( foldTailWithEVL ( )) {
4626
+ if ( any_of (Plan. getVectorLoopRegion ()-> getEntryBasicBlock ()-> phis (),
4627
+ IsaPred<VPEVLBasedIVPHIRecipe> )) {
4635
4628
LLVM_DEBUG (dbgs () << " LV: Preference for VP intrinsics indicated. "
4636
4629
" Unroll factor forced to be 1.\n " );
4637
4630
return 1 ;
4638
4631
}
4639
-
4640
4632
// We used the distance for the interleave count.
4641
4633
if (!Legal->isSafeForAnyVectorWidth ())
4642
4634
return 1 ;
4643
4635
4644
4636
// We don't attempt to perform interleaving for loops with uncountable early
4645
4637
// exits because the VPInstruction::AnyOf code cannot currently handle
4646
4638
// multiple parts.
4647
- if (Legal-> hasUncountableEarlyExit ())
4639
+ if (Plan. hasEarlyExit ())
4648
4640
return 1 ;
4649
4641
4650
- const bool HasReductions = !Legal->getReductionVars ().empty ();
4642
+ const bool HasReductions =
4643
+ any_of (Plan.getVectorLoopRegion ()->getEntryBasicBlock ()->phis (),
4644
+ IsaPred<VPReductionPHIRecipe>);
4651
4645
4652
4646
// If we did not calculate the cost for VF (because the user selected the VF)
4653
4647
// then we calculate the cost of VF here.
4654
4648
if (LoopCost == 0 ) {
4655
- LoopCost = expectedCost (VF);
4649
+ if (VF.isScalar ())
4650
+ LoopCost = CM.expectedCost (VF);
4651
+ else
4652
+ LoopCost = cost (Plan, VF);
4656
4653
assert (LoopCost.isValid () && " Expected to have chosen a VF with valid cost" );
4657
4654
4658
4655
// Loop body is free and there is no need for interleaving.
@@ -4661,7 +4658,7 @@ LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF,
4661
4658
}
4662
4659
4663
4660
VPRegisterUsage R =
4664
- calculateRegisterUsageForPlan (Plan, {VF}, TTI, ValuesToIgnore)[0 ];
4661
+ calculateRegisterUsageForPlan (Plan, {VF}, TTI, CM. ValuesToIgnore )[0 ];
4665
4662
// We divide by these constants so assume that we have at least one
4666
4663
// instruction that uses at least one register.
4667
4664
for (auto &Pair : R.MaxLocalUsers ) {
@@ -4722,21 +4719,21 @@ LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF,
4722
4719
MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
4723
4720
}
4724
4721
4725
- unsigned EstimatedVF = getEstimatedRuntimeVF (VF, VScaleForTuning );
4722
+ unsigned EstimatedVF = getEstimatedRuntimeVF (VF, CM. getVScaleForTuning () );
4726
4723
4727
4724
// Try to get the exact trip count, or an estimate based on profiling data or
4728
4725
// ConstantMax from PSE, failing that.
4729
- if (auto BestKnownTC = getSmallBestKnownTC (PSE, TheLoop )) {
4726
+ if (auto BestKnownTC = getSmallBestKnownTC (PSE, OrigLoop )) {
4730
4727
// At least one iteration must be scalar when this constraint holds. So the
4731
4728
// maximum available iterations for interleaving is one less.
4732
- unsigned AvailableTC = requiresScalarEpilogue (VF.isVector ())
4729
+ unsigned AvailableTC = CM. requiresScalarEpilogue (VF.isVector ())
4733
4730
? BestKnownTC->getFixedValue () - 1
4734
4731
: BestKnownTC->getFixedValue ();
4735
4732
4736
4733
unsigned InterleaveCountLB = bit_floor (std::max (
4737
4734
1u , std::min (AvailableTC / (EstimatedVF * 2 ), MaxInterleaveCount)));
4738
4735
4739
- if (getSmallConstantTripCount (PSE.getSE (), TheLoop ).isNonZero ()) {
4736
+ if (getSmallConstantTripCount (PSE.getSE (), OrigLoop ).isNonZero ()) {
4740
4737
// If the best known trip count is exact, we select between two
4741
4738
// prospective ICs, where
4742
4739
//
@@ -4797,7 +4794,7 @@ LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF,
4797
4794
// vectorized the loop we will have done the runtime check and so interleaving
4798
4795
// won't require further checks.
4799
4796
bool ScalarInterleavingRequiresPredication =
4800
- (VF.isScalar () && any_of (TheLoop ->blocks (), [this ](BasicBlock *BB) {
4797
+ (VF.isScalar () && any_of (OrigLoop ->blocks (), [this ](BasicBlock *BB) {
4801
4798
return Legal->blockNeedsPredication (BB);
4802
4799
}));
4803
4800
bool ScalarInterleavingRequiresRuntimePointerCheck =
@@ -4820,8 +4817,39 @@ LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF,
4820
4817
4821
4818
// Interleave until store/load ports (estimated by max interleave count) are
4822
4819
// saturated.
4823
- unsigned NumStores = Legal->getNumStores ();
4824
- unsigned NumLoads = Legal->getNumLoads ();
4820
+ unsigned NumStores = 0 ;
4821
+ unsigned NumLoads = 0 ;
4822
+ for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
4823
+ vp_depth_first_deep (Plan.getVectorLoopRegion ()->getEntry ()))) {
4824
+ for (VPRecipeBase &R : *VPBB) {
4825
+ if (isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe>(&R)) {
4826
+ NumLoads++;
4827
+ continue ;
4828
+ }
4829
+ if (isa<VPWidenStoreRecipe, VPWidenStoreEVLRecipe>(&R)) {
4830
+ NumStores++;
4831
+ continue ;
4832
+ }
4833
+
4834
+ if (auto *InterleaveR = dyn_cast<VPInterleaveRecipe>(&R)) {
4835
+ if (unsigned StoreOps = InterleaveR->getNumStoreOperands ())
4836
+ NumStores += StoreOps;
4837
+ else
4838
+ NumLoads += InterleaveR->getNumDefinedValues ();
4839
+ continue ;
4840
+ }
4841
+ if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
4842
+ NumLoads += isa<LoadInst>(RepR->getUnderlyingInstr ());
4843
+ NumStores += isa<StoreInst>(RepR->getUnderlyingInstr ());
4844
+ continue ;
4845
+ }
4846
+ if (isa<VPHistogramRecipe>(&R)) {
4847
+ NumLoads++;
4848
+ NumStores++;
4849
+ continue ;
4850
+ }
4851
+ }
4852
+ }
4825
4853
unsigned StoresIC = IC / (NumStores ? NumStores : 1 );
4826
4854
unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1 );
4827
4855
@@ -4831,12 +4859,15 @@ LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF,
4831
4859
// do the final reduction after the loop.
4832
4860
bool HasSelectCmpReductions =
4833
4861
HasReductions &&
4834
- any_of (Legal->getReductionVars (), [&](auto &Reduction) -> bool {
4835
- const RecurrenceDescriptor &RdxDesc = Reduction.second ;
4836
- RecurKind RK = RdxDesc.getRecurrenceKind ();
4837
- return RecurrenceDescriptor::isAnyOfRecurrenceKind (RK) ||
4838
- RecurrenceDescriptor::isFindIVRecurrenceKind (RK);
4839
- });
4862
+ any_of (Plan.getVectorLoopRegion ()->getEntryBasicBlock ()->phis (),
4863
+ [](VPRecipeBase &R) {
4864
+ auto *RedR = dyn_cast<VPReductionPHIRecipe>(&R);
4865
+
4866
+ return RedR && (RecurrenceDescriptor::isAnyOfRecurrenceKind (
4867
+ RedR->getRecurrenceKind ()) ||
4868
+ RecurrenceDescriptor::isFindIVRecurrenceKind (
4869
+ RedR->getRecurrenceKind ()));
4870
+ });
4840
4871
if (HasSelectCmpReductions) {
4841
4872
LLVM_DEBUG (dbgs () << " LV: Not interleaving select-cmp reductions.\n " );
4842
4873
return 1 ;
@@ -4847,12 +4878,14 @@ LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF,
4847
4878
// we're interleaving is inside another loop. For tree-wise reductions
4848
4879
// set the limit to 2, and for ordered reductions it's best to disable
4849
4880
// interleaving entirely.
4850
- if (HasReductions && TheLoop ->getLoopDepth () > 1 ) {
4881
+ if (HasReductions && OrigLoop ->getLoopDepth () > 1 ) {
4851
4882
bool HasOrderedReductions =
4852
- any_of (Legal->getReductionVars (), [&](auto &Reduction) -> bool {
4853
- const RecurrenceDescriptor &RdxDesc = Reduction.second ;
4854
- return RdxDesc.isOrdered ();
4855
- });
4883
+ any_of (Plan.getVectorLoopRegion ()->getEntryBasicBlock ()->phis (),
4884
+ [](VPRecipeBase &R) {
4885
+ auto *RedR = dyn_cast<VPReductionPHIRecipe>(&R);
4886
+
4887
+ return RedR && RedR->isOrdered ();
4888
+ });
4856
4889
if (HasOrderedReductions) {
4857
4890
LLVM_DEBUG (
4858
4891
dbgs () << " LV: Not interleaving scalar ordered reductions.\n " );
@@ -10071,8 +10104,11 @@ bool LoopVectorizePass::processLoop(Loop *L) {
10071
10104
10072
10105
GeneratedRTChecks Checks (PSE, DT, LI, TTI, F->getDataLayout (), CM.CostKind );
10073
10106
if (LVP.hasPlanWithVF (VF.Width )) {
10107
+ VPCostContext CostCtx (CM.TTI , *CM.TLI , CM.Legal ->getWidestInductionType (),
10108
+ CM, CM.CostKind );
10109
+
10074
10110
// Select the interleave count.
10075
- IC = CM .selectInterleaveCount (LVP.getPlanFor (VF.Width ), VF.Width , VF.Cost );
10111
+ IC = LVP .selectInterleaveCount (LVP.getPlanFor (VF.Width ), VF.Width , VF.Cost );
10076
10112
10077
10113
unsigned SelectedIC = std::max (IC, UserIC);
10078
10114
// Optimistically generate runtime checks if they are needed. Drop them if
@@ -10083,8 +10119,6 @@ bool LoopVectorizePass::processLoop(Loop *L) {
10083
10119
// Check if it is profitable to vectorize with runtime checks.
10084
10120
bool ForceVectorization =
10085
10121
Hints.getForce () == LoopVectorizeHints::FK_Enabled;
10086
- VPCostContext CostCtx (CM.TTI , *CM.TLI , CM.Legal ->getWidestInductionType (),
10087
- CM, CM.CostKind );
10088
10122
if (!ForceVectorization &&
10089
10123
!isOutsideLoopWorkProfitable (Checks, VF, L, PSE, CostCtx,
10090
10124
LVP.getPlanFor (VF.Width ), SEL,
0 commit comments