diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index ea0fa0668ef6b..48aaafe094a31 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -486,6 +486,13 @@ class LoopVectorizationPlanner {
   /// all profitable VFs in ProfitableVFs.
   VectorizationFactor computeBestVF();
 
+  /// \return The desired interleave count.
+  /// If interleave count has been specified by metadata it will be returned.
+  /// Otherwise, the interleave count is computed and returned. VF and LoopCost
+  /// are the selected vectorization factor and the cost of the selected VF.
+  unsigned selectInterleaveCount(VPlan &Plan, ElementCount VF,
+                                 InstructionCost LoopCost);
+
   /// Generate the IR code for the vectorized loop captured in VPlan \p BestPlan
   /// according to the best selected \p VF and \p UF.
   ///
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 20528089c0008..de0c15ee880d5 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -974,13 +974,6 @@ class LoopVectorizationCostModel {
   /// 64 bit loop indices.
   std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
 
-  /// \return The desired interleave count.
-  /// If interleave count has been specified by metadata it will be returned.
-  /// Otherwise, the interleave count is computed and returned. VF and LoopCost
-  /// are the selected vectorization factor and the cost of the selected VF.
-  unsigned selectInterleaveCount(VPlan &Plan, ElementCount VF,
-                                 InstructionCost LoopCost);
-
   /// Memory access instruction may be vectorized in more than one way.
   /// Form of instruction after vectorization depends on cost.
   /// This function takes cost-based decisions for Load/Store instructions
@@ -4653,8 +4646,8 @@ void LoopVectorizationCostModel::collectElementTypesForWidening() {
 }
 
 unsigned
-LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF,
-                                                  InstructionCost LoopCost) {
+LoopVectorizationPlanner::selectInterleaveCount(VPlan &Plan, ElementCount VF,
+                                                InstructionCost LoopCost) {
   // -- The interleave heuristics --
   // We interleave the loop in order to expose ILP and reduce the loop overhead.
   // There are many micro-architectural considerations that we can't predict
@@ -4669,11 +4662,11 @@ LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF,
   // 3. We don't interleave if we think that we will spill registers to memory
   // due to the increased register pressure.
 
-  if (!isScalarEpilogueAllowed())
+  if (!CM.isScalarEpilogueAllowed())
     return 1;
 
-  // Do not interleave if EVL is preferred and no User IC is specified.
-  if (foldTailWithEVL()) {
+  if (any_of(Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis(),
+             IsaPred<VPEVLBasedIVPHIRecipe>)) {
     LLVM_DEBUG(dbgs() << "LV: Preference for VP intrinsics indicated. "
                          "Unroll factor forced to be 1.\n");
     return 1;
@@ -4686,15 +4679,20 @@ LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF,
   // We don't attempt to perform interleaving for loops with uncountable early
   // exits because the VPInstruction::AnyOf code cannot currently handle
   // multiple parts.
-  if (Legal->hasUncountableEarlyExit())
+  if (Plan.hasEarlyExit())
     return 1;
 
-  const bool HasReductions = !Legal->getReductionVars().empty();
+  const bool HasReductions =
+      any_of(Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis(),
+             IsaPred<VPReductionPHIRecipe>);
 
   // If we did not calculate the cost for VF (because the user selected the VF)
   // then we calculate the cost of VF here.
   if (LoopCost == 0) {
-    LoopCost = expectedCost(VF);
+    if (VF.isScalar())
+      LoopCost = CM.expectedCost(VF);
+    else
+      LoopCost = cost(Plan, VF);
     assert(LoopCost.isValid() && "Expected to have chosen a VF with valid cost");
 
     // Loop body is free and there is no need for interleaving.
@@ -4703,7 +4701,7 @@ LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF,
 
   VPRegisterUsage R =
-      calculateRegisterUsageForPlan(Plan, {VF}, TTI, ValuesToIgnore)[0];
+      calculateRegisterUsageForPlan(Plan, {VF}, TTI, CM.ValuesToIgnore)[0];
   // We divide by these constants so assume that we have at least one
   // instruction that uses at least one register.
   for (auto &Pair : R.MaxLocalUsers) {
@@ -4766,23 +4764,24 @@ LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF,
 
   // Try to get the exact trip count, or an estimate based on profiling data or
   // ConstantMax from PSE, failing that.
-  auto BestKnownTC = getSmallBestKnownTC(PSE, TheLoop);
+  auto BestKnownTC = getSmallBestKnownTC(PSE, OrigLoop);
   // For fixed length VFs treat a scalable trip count as unknown.
   if (BestKnownTC && (BestKnownTC->isFixed() || VF.isScalable())) {
     // Re-evaluate trip counts and VFs to be in the same numerical space.
-    unsigned AvailableTC = estimateElementCount(*BestKnownTC, VScaleForTuning);
-    unsigned EstimatedVF = estimateElementCount(VF, VScaleForTuning);
+    unsigned AvailableTC =
+        estimateElementCount(*BestKnownTC, CM.getVScaleForTuning());
+    unsigned EstimatedVF = estimateElementCount(VF, CM.getVScaleForTuning());
 
     // At least one iteration must be scalar when this constraint holds. So the
     // maximum available iterations for interleaving is one less.
-    if (requiresScalarEpilogue(VF.isVector()))
+    if (CM.requiresScalarEpilogue(VF.isVector()))
       --AvailableTC;
 
     unsigned InterleaveCountLB = bit_floor(std::max(
         1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount)));
 
-    if (getSmallConstantTripCount(PSE.getSE(), TheLoop).isNonZero()) {
+    if (getSmallConstantTripCount(PSE.getSE(), OrigLoop).isNonZero()) {
       // If the best known trip count is exact, we select between two
       // prospective ICs, where
       //
@@ -4843,7 +4842,7 @@ LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF,
   // vectorized the loop we will have done the runtime check and so interleaving
   // won't require further checks.
   bool ScalarInterleavingRequiresPredication =
-      (VF.isScalar() && any_of(TheLoop->blocks(), [this](BasicBlock *BB) {
+      (VF.isScalar() && any_of(OrigLoop->blocks(), [this](BasicBlock *BB) {
         return Legal->blockNeedsPredication(BB);
       }));
   bool ScalarInterleavingRequiresRuntimePointerCheck =
@@ -4866,8 +4865,39 @@ LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF,
 
   // Interleave until store/load ports (estimated by max interleave count) are
   // saturated.
-  unsigned NumStores = Legal->getNumStores();
-  unsigned NumLoads = Legal->getNumLoads();
+  unsigned NumStores = 0;
+  unsigned NumLoads = 0;
+  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
+           vp_depth_first_deep(Plan.getVectorLoopRegion()->getEntry()))) {
+    for (VPRecipeBase &R : *VPBB) {
+      if (isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe>(&R)) {
+        NumLoads++;
+        continue;
+      }
+      if (isa<VPWidenStoreRecipe, VPWidenStoreEVLRecipe>(&R)) {
+        NumStores++;
+        continue;
+      }
+
+      if (auto *InterleaveR = dyn_cast<VPInterleaveRecipe>(&R)) {
+        if (unsigned StoreOps = InterleaveR->getNumStoreOperands())
+          NumStores += StoreOps;
+        else
+          NumLoads += InterleaveR->getNumDefinedValues();
+        continue;
+      }
+      if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
+        NumLoads += isa<LoadInst>(RepR->getUnderlyingInstr());
+        NumStores += isa<StoreInst>(RepR->getUnderlyingInstr());
+        continue;
+      }
+      if (isa<VPHistogramRecipe>(&R)) {
+        NumLoads++;
+        NumStores++;
+        continue;
+      }
+    }
+  }
 
   unsigned StoresIC = IC / (NumStores ? NumStores : 1);
   unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
@@ -4877,12 +4907,14 @@ LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF,
   // do the final reduction after the loop.
   bool HasSelectCmpReductions =
       HasReductions &&
-      any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
-        const RecurrenceDescriptor &RdxDesc = Reduction.second;
-        RecurKind RK = RdxDesc.getRecurrenceKind();
-        return RecurrenceDescriptor::isAnyOfRecurrenceKind(RK) ||
-               RecurrenceDescriptor::isFindIVRecurrenceKind(RK);
-      });
+      any_of(Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis(),
+             [](VPRecipeBase &R) {
+               auto *RedR = dyn_cast<VPReductionPHIRecipe>(&R);
+               return RedR && (RecurrenceDescriptor::isAnyOfRecurrenceKind(
+                                   RedR->getRecurrenceKind()) ||
+                               RecurrenceDescriptor::isFindIVRecurrenceKind(
+                                   RedR->getRecurrenceKind()));
+             });
   if (HasSelectCmpReductions) {
     LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n");
     return 1;
@@ -4893,12 +4925,14 @@ LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF,
   // we're interleaving is inside another loop. For tree-wise reductions
   // set the limit to 2, and for ordered reductions it's best to disable
   // interleaving entirely.
-  if (HasReductions && TheLoop->getLoopDepth() > 1) {
+  if (HasReductions && OrigLoop->getLoopDepth() > 1) {
     bool HasOrderedReductions =
-        any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
-          const RecurrenceDescriptor &RdxDesc = Reduction.second;
-          return RdxDesc.isOrdered();
-        });
+        any_of(Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis(),
+               [](VPRecipeBase &R) {
+                 auto *RedR = dyn_cast<VPReductionPHIRecipe>(&R);
+
+                 return RedR && RedR->isOrdered();
+               });
     if (HasOrderedReductions) {
       LLVM_DEBUG(
           dbgs() << "LV: Not interleaving scalar ordered reductions.\n");
@@ -10115,7 +10149,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
   GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(), CM.CostKind);
   if (LVP.hasPlanWithVF(VF.Width)) {
     // Select the interleave count.
-    IC = CM.selectInterleaveCount(LVP.getPlanFor(VF.Width), VF.Width, VF.Cost);
+    IC = LVP.selectInterleaveCount(LVP.getPlanFor(VF.Width), VF.Width, VF.Cost);
 
     unsigned SelectedIC = std::max(IC, UserIC);
     // Optimistically generate runtime checks if they are needed. Drop them if
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index a5de5933d5ff1..758aea48eb930 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -4229,7 +4229,10 @@ class VPlan {
   /// block with multiple predecessors (one for the exit via the latch and one
   /// via the other early exit).
   bool hasEarlyExit() const {
-    return ExitBlocks.size() > 1 ||
+    return count_if(ExitBlocks,
+                    [](VPIRBasicBlock *EB) {
+                      return EB->getNumPredecessors() != 0;
+                    }) > 1 ||
            (ExitBlocks.size() == 1 && ExitBlocks[0]->getNumPredecessors() > 1);
   }
 
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/predication_costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/predication_costs.ll
index 1fcbc8470fc3c..e103a912ff360 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/predication_costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/predication_costs.ll
@@ -19,7 +19,7 @@ target triple = "aarch64--linux-gnu"
 ; (udiv(2) + extractelement(8) + insertelement(4)) / 2 = 7
 ;
 ; CHECK: Scalarizing and predicating: %tmp4 = udiv i32 %tmp2, %tmp3
-; CHECK: Found an estimated cost of 7 for VF 2 For instruction: %tmp4 = udiv i32 %tmp2, %tmp3
+; CHECK: Cost of 7 for VF 2: profitable to scalarize %tmp4 = udiv i32 %tmp2, %tmp3
 ;
 define i32 @predicated_udiv(ptr %a, ptr %b, i1 %c, i64 %n) {
 entry:
@@ -60,7 +60,7 @@ for.end:
 ; (store(4) + extractelement(4)) / 2 = 4
 ;
 ; CHECK: Scalarizing and predicating: store i32 %tmp2, ptr %tmp0, align 4
-; CHECK: Found an estimated cost of 4 for VF 2 For instruction: store i32 %tmp2, ptr %tmp0, align 4
+; CHECK: Cost of 4 for VF 2: profitable to scalarize store i32 %tmp2, ptr %tmp0, align 4
 ;
 define void @predicated_store(ptr %a, i1 %c, i32 %x, i64 %n) {
 entry:
@@ -93,8 +93,8 @@ for.end:
 ; CHECK: Found scalar instruction: %addr = phi ptr [ %a, %entry ], [ %addr.next, %for.inc ]
 ; CHECK: Found scalar instruction: %addr.next = getelementptr inbounds i32, ptr %addr, i64 1
 ; CHECK: Scalarizing and predicating: store i32 %tmp2, ptr %addr, align 4
-; CHECK: Found an estimated cost of 0 for VF 2 For instruction: %addr = phi ptr [ %a, %entry ], [ %addr.next, %for.inc ]
-; CHECK: Found an estimated cost of 4 for VF 2 For instruction: store i32 %tmp2, ptr %addr, align 4
+; CHECK: Cost of 0 for VF 2: induction instruction %addr = phi ptr [ %a, %entry ], [ %addr.next, %for.inc ]
+; CHECK: Cost of 4 for VF 2: profitable to scalarize store i32 %tmp2, ptr %addr, align 4
 ;
 define void @predicated_store_phi(ptr %a, i1 %c, i32 %x, i64 %n) {
 entry:
@@ -135,9 +135,10 @@ for.end:
 ;
 ; CHECK: Scalarizing: %tmp3 = add nsw i32 %tmp2, %x
 ; CHECK: Scalarizing and predicating: %tmp4 = udiv i32 %tmp2, %tmp3
-; CHECK: Found an estimated cost of 3 for VF 2 For instruction: %tmp3 = add nsw i32 %tmp2, %x
-; CHECK: Found an estimated cost of 5 for VF 2 For instruction: %tmp4 = udiv i32 %tmp2, %tmp3
+; CHECK: Cost of 3 for VF 2: profitable to scalarize %tmp3 = add nsw i32 %tmp2, %x
+; CHECK: Cost of 5 for VF 2: profitable to scalarize %tmp4 = udiv i32 %tmp2, %tmp3
 ;
+
 define i32 @predicated_udiv_scalarized_operand(ptr %a, i1 %c, i32 %x, i64 %n) {
 entry:
   br label %for.body
@@ -180,8 +181,8 @@ for.end:
 ;
 ; CHECK: Scalarizing: %tmp2 = add nsw i32 %tmp1, %x
 ; CHECK: Scalarizing and predicating: store i32 %tmp2, ptr %tmp0, align 4
-; CHECK: Found an estimated cost of 3 for VF 2 For instruction: %tmp2 = add nsw i32 %tmp1, %x
-; CHECK: Found an estimated cost of 2 for VF 2 For instruction: store i32 %tmp2, ptr %tmp0, align 4
+; CHECK: Cost of 3 for VF 2: profitable to scalarize %tmp2 = add nsw i32 %tmp1, %x
+; CHECK: Cost of 2 for VF 2: profitable to scalarize store i32 %tmp2, ptr %tmp0, align 4
 ;
 define void @predicated_store_scalarized_operand(ptr %a, i1 %c, i32 %x, i64 %n) {
 entry:
@@ -232,11 +233,11 @@ for.end:
 ; CHECK: Scalarizing and predicating: %tmp4 = udiv i32 %tmp3, %tmp2
 ; CHECK: Scalarizing: %tmp5 = sub i32 %tmp4, %x
 ; CHECK: Scalarizing and predicating: store i32 %tmp5, ptr %tmp0, align 4
-; CHECK: Found an estimated cost of 1 for VF 2 For instruction: %tmp2 = add i32 %tmp1, %x
-; CHECK: Found an estimated cost of 7 for VF 2 For instruction: %tmp3 = sdiv i32 %tmp1, %tmp2
-; CHECK: Found an estimated cost of 7 for VF 2 For instruction: %tmp4 = udiv i32 %tmp3, %tmp2
-; CHECK: Found an estimated cost of 3 for VF 2 For instruction: %tmp5 = sub i32 %tmp4, %x
-; CHECK: Found an estimated cost of 2 for VF 2 For instruction: store i32 %tmp5, ptr %tmp0, align 4
+; CHECK: Cost of 7 for VF 2: profitable to scalarize %tmp4 = udiv i32 %tmp3, %tmp2
+; CHECK: Cost of 7 for VF 2: profitable to scalarize %tmp3 = sdiv i32 %tmp1, %tmp2
+; CHECK: Cost of 2 for VF 2: profitable to scalarize store i32 %tmp5, ptr %tmp0, align 4
+; CHECK: Cost of 3 for VF 2: profitable to scalarize %tmp5 = sub i32 %tmp4, %x
+; CHECK: Cost of 1 for VF 2: WIDEN ir<%tmp2> = add ir<%tmp1>, ir<%x>
 ;
 define void @predication_multi_context(ptr %a, i1 %c, i32 %x, i64 %n) {
 entry:
diff --git a/llvm/test/Transforms/LoopVectorize/SystemZ/load-store-scalarization-cost.ll b/llvm/test/Transforms/LoopVectorize/SystemZ/load-store-scalarization-cost.ll
index deb81099ac676..e24c5a1a2bf2d 100644
--- a/llvm/test/Transforms/LoopVectorize/SystemZ/load-store-scalarization-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/SystemZ/load-store-scalarization-cost.ll
@@ -27,7 +27,6 @@ for.end:
 
 ; CHECK: LV: Scalarizing: %tmp1 = load i32, ptr %tmp0, align 4
 ; CHECK: LV: Scalarizing: store i32 %tmp2, ptr %tmp0, align 4
-; CHECK: LV: Found an estimated cost of 4 for VF 4 For instruction: %tmp1 = load i32, ptr %tmp0, align 4
-; CHECK: LV: Found an estimated cost of 4 for VF 4 For instruction: store i32 %tmp2, ptr %tmp0, align 4
+; CHECK: Cost of 4 for VF 4: REPLICATE ir<%tmp1> = load ir<%tmp0>
+; CHECK: Cost of 4 for VF 4: REPLICATE store ir<%tmp2>, ir<%tmp0>
 }
-
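
Note on the trip-count clamp kept in selectInterleaveCount above: the lower bound is
bit_floor(std::max(1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount))),
i.e. leave at least two vector iterations' worth of trip count per interleave group,
clamp to the target's interleave limit, and round down to a power of two. A minimal
standalone sketch of that arithmetic (compiles with -std=c++20; the concrete values are
illustrative assumptions, not taken from this patch):

#include <algorithm>
#include <bit>
#include <cstdio>

int main() {
  unsigned AvailableTC = 100;      // assumed best-known trip count
  unsigned EstimatedVF = 8;        // assumed effective VF after vscale tuning
  unsigned MaxInterleaveCount = 8; // assumed target interleave limit

  // Same formula as the patch: at least two vector iterations per interleave
  // group, clamped to the target limit, rounded down to a power of two.
  unsigned InterleaveCountLB = std::bit_floor(std::max(
      1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount)));
  std::printf("InterleaveCountLB = %u\n", InterleaveCountLB); // prints 4
}

The power-of-two rounding (llvm::bit_floor in the patch, std::bit_floor here) keeps the
interleaved body evenly divisible into the available iterations.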