-
Notifications
You must be signed in to change notification settings - Fork 14.7k
[LV] Provide utility routine to find uncounted exit recipes #152530
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -138,3 +138,110 @@ VPBasicBlock *vputils::getFirstLoopHeader(VPlan &Plan, VPDominatorTree &VPDT) { | |
}); | ||
return I == DepthFirst.end() ? nullptr : cast<VPBasicBlock>(*I); | ||
} | ||
|
||
std::optional<VPValue *> | ||
vputils::getRecipesForUncountedExit(VPlan &Plan, | ||
SmallVectorImpl<VPRecipeBase *> &Recipes, | ||
SmallVectorImpl<VPRecipeBase *> &GEPs) { | ||
using namespace llvm::VPlanPatternMatch; | ||
// Given a vplan like the following (just including the recipes contributing | ||
// to loop control exiting here, not the actual work), we're looking to match | ||
// the recipes contributing to the uncounted exit condition comparison | ||
// (here, vp<%4>) back to the canonical induction for the vector body so that | ||
// we can copy them to a preheader and rotate the address in the loop to the | ||
// next vector iteration. | ||
// | ||
// VPlan ' for UF>=1' { | ||
// Live-in vp<%0> = VF | ||
// Live-in ir<64> = original trip-count | ||
// | ||
// entry: | ||
// Successor(s): preheader, vector.ph | ||
// | ||
// vector.ph: | ||
// Successor(s): vector loop | ||
// | ||
// <x1> vector loop: { | ||
// vector.body: | ||
// EMIT vp<%2> = CANONICAL-INDUCTION ir<0> | ||
// vp<%3> = SCALAR-STEPS vp<%2>, ir<1>, vp<%0> | ||
// CLONE ir<%ee.addr> = getelementptr ir<0>, vp<%3> | ||
// WIDEN ir<%ee.load> = load ir<%ee.addr> | ||
// WIDEN vp<%4> = icmp eq ir<%ee.load>, ir<0> | ||
// EMIT vp<%5> = any-of vp<%4> | ||
// EMIT vp<%6> = add vp<%2>, vp<%0> | ||
// EMIT vp<%7> = icmp eq vp<%6>, ir<64> | ||
// EMIT vp<%8> = or vp<%5>, vp<%7> | ||
// EMIT branch-on-cond vp<%8> | ||
// No successors | ||
// } | ||
// Successor(s): middle.block | ||
// | ||
// middle.block: | ||
// Successor(s): preheader | ||
// | ||
// preheader: | ||
// No successors | ||
// } | ||
|
||
// Find the uncounted loop exit condition. | ||
auto *Region = Plan.getVectorLoopRegion(); | ||
VPValue *UncountedCondition = nullptr; | ||
if (!match( | ||
Region->getExitingBasicBlock()->getTerminator(), | ||
m_BranchOnCond(m_OneUse(m_c_BinaryOr( | ||
m_OneUse(m_AnyOf(m_VPValue(UncountedCondition))), m_VPValue()))))) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Does it matter if this is a loop with an uncountable early exit that doesn't match this pattern? If it's being called as part of VPlanTransforms::handleEarlyExits then it should be fine, but later on I can imagine a VPlanTransform may optimise some of this code. I assume that returning There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If we can't match the expected recipes here, then the transform (from #148626) will abandon the vplan and vectorization will not proceed. It's the reason I can do this as a vplan transform instead of manually planting recipes as part of the initial vplan creation based on what LoopVectorizationLegality finds -- I did it the latter way in 2015 when I originally prototyped early exit autovec. |
||
return std::nullopt; | ||
|
||
SmallVector<VPValue *, 4> Worklist; | ||
SmallVector<VPWidenLoadRecipe *, 1> Loads; | ||
Worklist.push_back(UncountedCondition); | ||
while (!Worklist.empty()) { | ||
VPValue *V = Worklist.pop_back_val(); | ||
|
||
// Any value defined outside the loop does not need to be copied. | ||
if (V->isDefinedOutsideLoopRegions()) | ||
continue; | ||
|
||
// FIXME: Remove the single user restriction; it's here because we're | ||
// starting with the simplest set of loops we can, and multiple | ||
// users means needing to add PHI nodes in the transform. | ||
if (V->getNumUsers() > 1) | ||
return std::nullopt; | ||
Comment on lines
+209
to
+210
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What's the issue with having multiple users? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. In theory, nothing, except that I'm trying to make the simplest version I can to start with to aid with reviewing, and I can then gradually increase what the code deals with. In this case, having more than one user may mean the transform needs to create a PHI node for the value, since these nodes will be copied to the preheader and the versions inside the loop rotated to the next vector iteration. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. That makes sense, you don't want the next vector iteration to have to recompute this again. Could you describe that in the comment? (possibly with a FIXME to add PHI nodes for these in the future) |
||
|
||
// Walk back through recipes until we find at least one load from memory. | ||
if (auto *Cmp = dyn_cast<VPWidenRecipe>(V)) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The Maybe also add a comment saying that this only supports a very specific case, but that this can be generalised in the future to other expressions as long as they have no side-effects? (or something along those lines) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I could make it just match the exact IR (as I did in #137774) if needed, though I wanted to show the basics of what the code will look like for matching more loops later. This effectively has the same limitations as the straight-line code, but can be extended easily in the future. I suppose I could build up a list of loads to check, then do a more exacting match there. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Would the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The checks on that can be moved to the transform itself, which currently assumes that it's transforming a loop which has an uncounted condition based on at least one load that can be cloned into the preheader, and at least one GEP which must be adjusted. |
||
if (Cmp->getOpcode() != Instruction::ICmp) | ||
return std::nullopt; | ||
Worklist.push_back(Cmp->getOperand(0)); | ||
Worklist.push_back(Cmp->getOperand(1)); | ||
Recipes.push_back(Cmp); | ||
} else if (auto *Load = dyn_cast<VPWidenLoadRecipe>(V)) { | ||
// Reject masked loads for the time being; they make the exit condition | ||
// more complex. | ||
if (Load->isMasked()) | ||
return std::nullopt; | ||
Loads.push_back(Load); | ||
} else | ||
return std::nullopt; | ||
} | ||
|
||
// Check the loads for exact patterns; for now we only support a contiguous | ||
// load based directly on the canonical IV with a step of 1. | ||
for (VPWidenLoadRecipe *Load : Loads) { | ||
Recipes.push_back(Load); | ||
VPValue *GEP = Load->getAddr(); | ||
|
||
if (!match(GEP, m_GetElementPtr( | ||
m_LoopInvVPValue(), | ||
m_ScalarIVSteps(m_Specific(Plan.getCanonicalIV()), | ||
m_SpecificInt(1), | ||
m_Specific(&Plan.getVF()))))) | ||
return std::nullopt; | ||
|
||
Recipes.push_back(GEP->getDefiningRecipe()); | ||
GEPs.push_back(GEP->getDefiningRecipe()); | ||
} | ||
|
||
return UncountedCondition; | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -148,6 +148,8 @@ class LLVM_ABI_FOR_TEST VPValue { | |
return Current != user_end(); | ||
} | ||
|
||
/// Returns true when getNumUsers() reports exactly one user for this value. | ||
/// Despite the name, the check is on the user count returned by | ||
/// getNumUsers(), not on a separate notion of "uses". | ||
bool hasOneUse() const { return getNumUsers() == 1; } | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The name of the function doesn't match the code, i.e. one use != one user. I think it should either be:
or
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Uses are not directly modeled in VPlan, afaict. Instead, a User may be recorded multiple times for a given VPValue. See the function directly above this, There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hi @fhahn, I think this is pretty confusing. Can we rename the existing |
||
|
||
void replaceAllUsesWith(VPValue *New); | ||
|
||
/// Go through the uses list for this VPValue and make each use point to \p | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,99 @@ | ||
//===- llvm/unittests/Transforms/Vectorize/VPlanUncountedExitTest.cpp -----===// | ||
// | ||
// | ||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. | ||
// See https://llvm.org/LICENSE.txt for license information. | ||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | ||
// | ||
//===----------------------------------------------------------------------===// | ||
|
||
#include "../lib/Transforms/Vectorize/LoopVectorizationPlanner.h" | ||
#include "../lib/Transforms/Vectorize/VPlan.h" | ||
#include "../lib/Transforms/Vectorize/VPlanPatternMatch.h" | ||
#include "../lib/Transforms/Vectorize/VPlanUtils.h" | ||
#include "VPlanTestBase.h" | ||
#include "llvm/ADT/SmallVector.h" | ||
#include "llvm/IR/Instruction.h" | ||
#include "llvm/IR/Instructions.h" | ||
#include "gtest/gtest.h" | ||
|
||
namespace llvm { | ||
|
||
namespace { | ||
// Fixture for the uncounted-exit utility tests; it adds no members of its | ||
// own on top of VPlanTestBase. | ||
class VPUncountedExitTest : public VPlanTestBase {}; | ||
|
||
// Builds a small VPlan by hand: a vector loop region made of the single block | ||
// "vector.body", terminated by a branch on | ||
// or(any-of(icmp eq(load, 0)), icmp eq(IV + VF, trip-count)). | ||
// It then runs vputils::getRecipesForUncountedExit on the plan and checks the | ||
// returned condition, the collected GEPs and the number of collected recipes. | ||
TEST_F(VPUncountedExitTest, FindUncountedExitRecipes) { | ||
// Create CFG skeleton. | ||
VPlan &Plan = getPlan(); | ||
VPBasicBlock *ScalarPH = Plan.getEntry(); | ||
VPBasicBlock *Entry = Plan.createVPBasicBlock("entry"); | ||
Plan.setEntry(Entry); | ||
VPBasicBlock *VectorPH = Plan.createVPBasicBlock("vector.ph"); | ||
VPBasicBlock *VecBody = Plan.createVPBasicBlock("vector.body"); | ||
VPRegionBlock *Region = | ||
Plan.createVPRegionBlock(VecBody, VecBody, "vector loop"); | ||
VPBasicBlock *MiddleBlock = Plan.createVPBasicBlock("middle.block"); | ||
VPBlockUtils::connectBlocks(Entry, ScalarPH); | ||
VPBlockUtils::connectBlocks(Entry, VectorPH); | ||
VPBlockUtils::connectBlocks(VectorPH, Region); | ||
VPBlockUtils::connectBlocks(Region, MiddleBlock); | ||
VPBlockUtils::connectBlocks(MiddleBlock, ScalarPH); | ||
|
||
// Live-Ins | ||
IntegerType *I64Ty = IntegerType::get(C, 64); | ||
IntegerType *I32Ty = IntegerType::get(C, 32); | ||
PointerType *PTy = PointerType::get(C, 0); | ||
VPValue *Zero = Plan.getOrAddLiveIn(ConstantInt::get(I64Ty, 0)); | ||
VPValue *Inc = Plan.getOrAddLiveIn(ConstantInt::get(I64Ty, 1)); | ||
VPValue *VF = &Plan.getVF(); | ||
Plan.setTripCount(Plan.getOrAddLiveIn(ConstantInt::get(I64Ty, 64))); | ||
|
||
// Populate vector.body with the recipes for exiting. | ||
auto *IV = new VPCanonicalIVPHIRecipe(Zero, {}); | ||
VecBody->appendRecipe(IV); | ||
VPBuilder Builder(VecBody, VecBody->getFirstNonPhi()); | ||
auto *Steps = Builder.createScalarIVSteps(Instruction::Add, nullptr, IV, Inc, | ||
VF, DebugLoc()); | ||
|
||
// Uncounted Exit; GEP -> Load -> Cmp | ||
// The GEP and load recipes wrap dummy IR instructions that are created | ||
// here without an insertion point and are freed by hand at the end. | ||
auto *DummyGEP = GetElementPtrInst::Create(I32Ty, Zero->getUnderlyingValue(), | ||
{}, Twine("ee.addr")); | ||
auto *GEP = new VPReplicateRecipe(DummyGEP, {Zero, Steps}, true, nullptr); | ||
Builder.insert(GEP); | ||
auto *DummyLoad = | ||
new LoadInst(I32Ty, PoisonValue::get(PTy), "ee.load", false, Align(1)); | ||
VPValue *Load = | ||
new VPWidenLoadRecipe(*DummyLoad, GEP, nullptr, true, false, {}, {}); | ||
Builder.insert(Load->getDefiningRecipe()); | ||
// Should really splat the zero, but we're not checking types here. | ||
VPValue *Cmp = new VPWidenRecipe(Instruction::ICmp, {Load, Zero}, | ||
VPIRFlags(CmpInst::ICMP_EQ), {}, {}); | ||
Builder.insert(Cmp->getDefiningRecipe()); | ||
VPValue *AnyOf = Builder.createNaryOp(VPInstruction::AnyOf, Cmp); | ||
|
||
// Counted Exit; Inc IV -> Cmp | ||
VPValue *NextIV = Builder.createNaryOp(Instruction::Add, {IV, VF}); | ||
VPValue *Counted = | ||
Builder.createICmp(CmpInst::ICMP_EQ, NextIV, Plan.getTripCount()); | ||
|
||
// Combine, and branch. | ||
VPValue *Combined = Builder.createNaryOp(Instruction::Or, {AnyOf, Counted}); | ||
Builder.createNaryOp(VPInstruction::BranchOnCond, {Combined}); | ||
|
||
// Output lists filled in by the utility under test. | ||
SmallVector<VPRecipeBase *, 8> Recipes; | ||
SmallVector<VPRecipeBase *, 2> GEPs; | ||
|
||
// Expect the widened compare as the uncounted condition, the one GEP | ||
// created above as the only collected GEP, and three recipes in total. | ||
std::optional<VPValue *> UncountedCondition = | ||
vputils::getRecipesForUncountedExit(Plan, Recipes, GEPs); | ||
ASSERT_TRUE(UncountedCondition.has_value()); | ||
ASSERT_EQ(*UncountedCondition, Cmp); | ||
ASSERT_EQ(GEPs.size(), 1ull); | ||
ASSERT_EQ(GEPs[0], GEP); | ||
ASSERT_EQ(Recipes.size(), 3ull); | ||
|
||
// Free the dummy IR instructions created above. | ||
// NOTE(review): ASSERT_* returns from the test body on failure, so these | ||
// deletes are skipped (and the instructions leak) when an assertion fails. | ||
delete DummyLoad; | ||
delete DummyGEP; | ||
} | ||
|
||
} // namespace | ||
} // namespace llvm |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The IR above suggests we've already called `handleUncountableEarlyExit`, and so this probably should be `// Successor(s): middle.split`.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The transform PR explicitly avoids creating the split, since we won't have finished all iterations before leaving the vector body.