[VPlan] Add ExtractLane VPInst to extract across multiple parts. #148817

Open
wants to merge 1 commit into base: main
6 changes: 0 additions & 6 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -10069,12 +10069,6 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// Get user vectorization factor and interleave count.
ElementCount UserVF = Hints.getWidth();
unsigned UserIC = Hints.getInterleave();
if (LVL.hasUncountableEarlyExit() && UserIC != 1) {
UserIC = 1;
reportVectorizationInfo("Interleaving not supported for loops "
"with uncountable early exits",
"InterleaveEarlyExitDisabled", ORE, L);
}

// Plan how to best vectorize.
LVP.plan(UserVF, UserIC);
4 changes: 4 additions & 0 deletions llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1012,6 +1012,10 @@ class LLVM_ABI_FOR_TEST VPInstruction : public VPRecipeWithIRFlags,
ReductionStartVector,
// Creates a step vector starting from 0 to VF with a step of 1.
StepVector,
/// Extracts a single lane from a set of vector operands; the lane index is
/// given by the first operand. The index addresses the notional vector formed
/// by concatenating all vector operands (all operands after the first one).
ExtractLane,

};

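To make the new opcode's contract concrete: the first operand is the lane index and the remaining operands are the vector parts it indexes into. The following standalone C++ sketch models that semantics only; it is not the VPlan implementation, and the `int` element type and function name are illustrative.

```cpp
#include <cstddef>
#include <vector>

// Conceptual model of ExtractLane: Lane indexes the concatenation of all
// vector parts, each of the same (possibly runtime) length VF, so the result
// is element (Lane % VF) of part (Lane / VF).
int extractLane(size_t Lane, const std::vector<std::vector<int>> &Parts) {
  size_t VF = Parts.front().size(); // all parts share the same length
  return Parts[Lane / VF][Lane % VF];
}
```

For example, with two parts of four lanes each, lane 5 selects element 1 of the second part.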
2 changes: 2 additions & 0 deletions llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -109,6 +109,8 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) {
case VPInstruction::BuildStructVector:
case VPInstruction::BuildVector:
return SetResultTyFromOp();
case VPInstruction::ExtractLane:
return inferScalarType(R->getOperand(1));
case VPInstruction::FirstActiveLane:
return Type::getIntNTy(Ctx, 64);
case VPInstruction::ExtractLastElement:
34 changes: 33 additions & 1 deletion llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -860,6 +860,31 @@ Value *VPInstruction::generate(VPTransformState &State) {
Res = Builder.CreateOr(Res, State.get(Op));
return Builder.CreateOrReduce(Res);
}
case VPInstruction::ExtractLane: {
Value *LaneToExtract = State.get(getOperand(0), true);
Type *IdxTy = State.TypeAnalysis.inferScalarType(getOperand(0));
Value *Res = nullptr;
Value *RuntimeVF = getRuntimeVF(State.Builder, IdxTy, State.VF);

for (unsigned Idx = 1; Idx != getNumOperands(); ++Idx) {
Value *VectorStart =
Builder.CreateMul(RuntimeVF, ConstantInt::get(IdxTy, Idx - 1));
Value *VectorIdx = Idx == 1
? LaneToExtract
: Builder.CreateSub(LaneToExtract, VectorStart);
Comment on lines +872 to +874

Contributor: I thought we used a ConstantFolder in the VPTransformState IR builder? Does this not get folded away when Idx == 1?

Contributor Author: Unfortunately that won't get folded by the ConstantFolder I think, as the result isn't a Constant * but a Value * (`sub %LaneToExtract, 0` -> `%LaneToExtract`).

Value *Ext = State.VF.isScalar()
? State.get(getOperand(Idx))
: Builder.CreateExtractElement(
State.get(getOperand(Idx)), VectorIdx);
if (Res) {
Value *Cmp = Builder.CreateICmpUGE(LaneToExtract, VectorStart);
Res = Builder.CreateSelect(Cmp, Ext, Res);
} else {
Res = Ext;
}
}
return Res;
}
case VPInstruction::FirstActiveLane: {
if (getNumOperands() == 1) {
Value *Mask = State.get(getOperand(0));
@@ -918,7 +943,8 @@ InstructionCost VPInstruction::computeCost(ElementCount VF,
}

switch (getOpcode()) {
case Instruction::ExtractElement: {
case Instruction::ExtractElement:
case VPInstruction::ExtractLane: {
// Add on the cost of extracting the element.
auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
return Ctx.TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy,
@@ -980,6 +1006,7 @@ bool VPInstruction::isVectorToScalar() const {
return getOpcode() == VPInstruction::ExtractLastElement ||
getOpcode() == VPInstruction::ExtractPenultimateElement ||
getOpcode() == Instruction::ExtractElement ||
getOpcode() == VPInstruction::ExtractLane ||
getOpcode() == VPInstruction::FirstActiveLane ||
getOpcode() == VPInstruction::ComputeAnyOfResult ||
getOpcode() == VPInstruction::ComputeFindIVResult ||
@@ -1038,6 +1065,7 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const {
case VPInstruction::BuildVector:
case VPInstruction::CalculateTripCountMinusVF:
case VPInstruction::CanonicalIVIncrementForPart:
case VPInstruction::ExtractLane:
case VPInstruction::ExtractLastElement:
case VPInstruction::ExtractPenultimateElement:
case VPInstruction::FirstActiveLane:
@@ -1063,6 +1091,7 @@ bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
default:
return false;
case Instruction::ExtractElement:
case VPInstruction::ExtractLane:
return Op == getOperand(1);
case Instruction::PHI:
return true;
@@ -1164,6 +1193,9 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
case VPInstruction::BuildVector:
O << "buildvector";
break;
case VPInstruction::ExtractLane:
O << "extract-lane";
break;
case VPInstruction::ExtractLastElement:
O << "extract-last-element";
break;
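Condensing the generate() hunk above: rather than dividing the lane by the runtime VF, the emitted IR walks the parts and keeps a running select. The self-contained C++ sketch below models only that decision chain; in the actual IR the out-of-range extracts produce poison that the later selects discard, which the bounds guard here stands in for.

```cpp
#include <cstddef>
#include <vector>

// Mirrors the emitted pattern: for part Idx (1-based over vector operands),
//   VectorStart = VF * (Idx - 1)
//   VectorIdx   = (Idx == 1) ? Lane : Lane - VectorStart
//   Ext         = extractelement(Part, VectorIdx)
//   Res         = (Idx == 1) ? Ext : select(Lane >= VectorStart, Ext, Res)
// The last part whose start offset is <= Lane wins.
int extractLaneSelectChain(size_t Lane,
                           const std::vector<std::vector<int>> &Parts) {
  size_t VF = Parts.front().size();
  int Res = 0;
  for (size_t Idx = 1; Idx <= Parts.size(); ++Idx) {
    size_t VectorStart = VF * (Idx - 1);
    size_t VectorIdx = Idx == 1 ? Lane : Lane - VectorStart;
    // Guard the access; the real IR extracts unconditionally and relies on
    // the select chain to ignore out-of-range (poison) results.
    int Ext = VectorIdx < VF ? Parts[Idx - 1][VectorIdx] : 0;
    Res = Idx == 1 ? Ext : (Lane >= VectorStart ? Ext : Res);
  }
  return Res;
}
```

The `Idx == 1` special case in the diff exists because, as the review thread above notes, a `sub %LaneToExtract, 0` would not be simplified by the builder's ConstantFolder, since its operands are not all constants.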
8 changes: 4 additions & 4 deletions llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -774,10 +774,10 @@ static VPValue *optimizeEarlyExitInductionUser(VPlan &Plan,
using namespace VPlanPatternMatch;

VPValue *Incoming, *Mask;
if (!match(Op, m_VPInstruction<Instruction::ExtractElement>(
m_VPValue(Incoming),
if (!match(Op, m_VPInstruction<VPInstruction::ExtractLane>(
m_VPInstruction<VPInstruction::FirstActiveLane>(
m_VPValue(Mask)))))
m_VPValue(Mask)),
m_VPValue(Incoming))))
return nullptr;

auto *WideIV = getOptimizableIVOf(Incoming);
@@ -2831,7 +2831,7 @@ void VPlanTransforms::handleUncountableEarlyExit(
VPInstruction::FirstActiveLane, {CondToEarlyExit}, nullptr,
"first.active.lane");
IncomingFromEarlyExit = EarlyExitB.createNaryOp(
Instruction::ExtractElement, {IncomingFromEarlyExit, FirstActiveLane},
VPInstruction::ExtractLane, {FirstActiveLane, IncomingFromEarlyExit},
nullptr, "early.exit.value");
ExitIRI->setOperand(EarlyExitIdx, IncomingFromEarlyExit);
}
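The handleUncountableEarlyExit change above now feeds extract-lane with the result of first-active-lane followed by the incoming value, in that operand order. A minimal standalone C++ model of the combined computation follows; it is not the recipe code itself, and the element types and names are illustrative.

```cpp
#include <cstddef>
#include <vector>

// First set bit across the concatenated exit-condition parts (the per-part
// cttz.elts calls plus selects in the tests compute the same index).
size_t firstActiveLane(const std::vector<std::vector<bool>> &MaskParts) {
  size_t Lane = 0;
  for (const auto &Part : MaskParts)
    for (bool Bit : Part) {
      if (Bit)
        return Lane;
      ++Lane;
    }
  return Lane; // no active lane: one past the last lane
}

// The early-exit value is extract-lane of that index over the incoming parts.
int earlyExitValue(const std::vector<std::vector<bool>> &MaskParts,
                   const std::vector<std::vector<int>> &IncomingParts) {
  size_t Lane = firstActiveLane(MaskParts);
  size_t VF = IncomingParts.front().size();
  return IncomingParts[Lane / VF][Lane % VF];
}
```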
7 changes: 7 additions & 0 deletions llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
@@ -363,6 +363,13 @@ void UnrollState::unrollBlock(VPBlockBase *VPB) {
continue;
}
VPValue *Op0;
if (match(&R, m_VPInstruction<VPInstruction::ExtractLane>(
m_VPValue(Op0), m_VPValue(Op1)))) {
addUniformForAllParts(cast<VPInstruction>(&R));
for (unsigned Part = 1; Part != UF; ++Part)
R.addOperand(getValueForPart(Op1, Part));
continue;
}
if (match(&R, m_VPInstruction<VPInstruction::ExtractLastElement>(
m_VPValue(Op0))) ||
match(&R, m_VPInstruction<VPInstruction::ExtractPenultimateElement>(
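The VPlanUnroll.cpp hunk keeps a single extract-lane recipe for all parts and appends each part's copy of the vector operand, so after unrolling by UF the recipe has 1 + UF operands. A toy C++ model of that operand rewrite is sketched below; the recipe representation and the operand naming are invented for illustration, while the real code works on VPInstructions and VPValues via getValueForPart.

```cpp
#include <cassert>
#include <string>
#include <vector>

// Toy stand-in for a recipe: an opcode plus operand names.
struct Recipe {
  std::string Opcode;
  std::vector<std::string> Operands;
};

// Before unrolling: extract-lane %first.active.lane, %incoming
// After  (UF = 4): extract-lane %first.active.lane, %incoming, %incoming.1,
//                  %incoming.2, %incoming.3
void unrollExtractLane(Recipe &R, unsigned UF) {
  assert(R.Opcode == "extract-lane" && R.Operands.size() == 2);
  for (unsigned Part = 1; Part != UF; ++Part)
    R.Operands.push_back(R.Operands[1] + "." + std::to_string(Part));
}
```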
@@ -14,29 +14,60 @@ define i64 @same_exit_block_pre_inc_use1() #0 {
; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024)
; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024)
; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 16
; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 64
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 510, [[TMP1]]
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 16
; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 64
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 510, [[TMP3]]
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 510, [[N_MOD_VF]]
; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 16
; CHECK-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 64
; CHECK-NEXT: [[INDEX_NEXT:%.*]] = add i64 3, [[N_VEC]]
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], [[LOOP]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]]
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[OFFSET_IDX]]
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0
; CHECK-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP19:%.*]] = mul nuw i64 [[TMP18]], 16
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i64 [[TMP19]]
; CHECK-NEXT: [[TMP29:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP36:%.*]] = mul nuw i64 [[TMP29]], 32
; CHECK-NEXT: [[TMP37:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i64 [[TMP36]]
; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP38:%.*]] = mul nuw i64 [[TMP15]], 48
; CHECK-NEXT: [[TMP54:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i64 [[TMP38]]
; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 16 x i8>, ptr [[TMP8]], align 1
; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 16 x i8>, ptr [[TMP11]], align 1
; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 16 x i8>, ptr [[TMP37]], align 1
; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <vscale x 16 x i8>, ptr [[TMP54]], align 1
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[OFFSET_IDX]]
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i32 0
; CHECK-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP21:%.*]] = mul nuw i64 [[TMP20]], 16
; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i64 [[TMP21]]
; CHECK-NEXT: [[TMP23:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP24:%.*]] = mul nuw i64 [[TMP23]], 32
; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i64 [[TMP24]]
; CHECK-NEXT: [[TMP26:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP27:%.*]] = mul nuw i64 [[TMP26]], 48
; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i64 [[TMP27]]
; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <vscale x 16 x i8>, ptr [[TMP10]], align 1
; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <vscale x 16 x i8>, ptr [[TMP22]], align 1
; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <vscale x 16 x i8>, ptr [[TMP25]], align 1
; CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load <vscale x 16 x i8>, ptr [[TMP28]], align 1
; CHECK-NEXT: [[TMP32:%.*]] = icmp ne <vscale x 16 x i8> [[WIDE_LOAD4]], [[WIDE_LOAD8]]
; CHECK-NEXT: [[TMP30:%.*]] = icmp ne <vscale x 16 x i8> [[WIDE_LOAD2]], [[WIDE_LOAD6]]
; CHECK-NEXT: [[TMP31:%.*]] = icmp ne <vscale x 16 x i8> [[WIDE_LOAD3]], [[WIDE_LOAD7]]
; CHECK-NEXT: [[TMP59:%.*]] = icmp ne <vscale x 16 x i8> [[WIDE_LOAD5]], [[WIDE_LOAD9]]
; CHECK-NEXT: [[INDEX_NEXT3]] = add nuw i64 [[INDEX1]], [[TMP5]]
; CHECK-NEXT: [[TMP12:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[TMP32]])
; CHECK-NEXT: [[TMP33:%.*]] = or <vscale x 16 x i1> [[TMP32]], [[TMP30]]
; CHECK-NEXT: [[TMP34:%.*]] = or <vscale x 16 x i1> [[TMP33]], [[TMP31]]
; CHECK-NEXT: [[TMP35:%.*]] = or <vscale x 16 x i1> [[TMP34]], [[TMP59]]
; CHECK-NEXT: [[TMP12:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[TMP35]])
; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT3]], [[N_VEC]]
; CHECK-NEXT: [[TMP14:%.*]] = or i1 [[TMP12]], [[TMP13]]
; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_SPLIT:%.*]], label [[LOOP]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -46,8 +77,27 @@ define i64 @same_exit_block_pre_inc_use1() #0 {
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 510, [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_END:%.*]], label [[SCALAR_PH]]
; CHECK: vector.early.exit:
; CHECK-NEXT: [[TMP39:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP40:%.*]] = mul nuw i64 [[TMP39]], 16
; CHECK-NEXT: [[TMP41:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> [[TMP59]], i1 true)
; CHECK-NEXT: [[TMP42:%.*]] = mul i64 [[TMP40]], 3
; CHECK-NEXT: [[TMP43:%.*]] = add i64 [[TMP42]], [[TMP41]]
; CHECK-NEXT: [[TMP44:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> [[TMP31]], i1 true)
; CHECK-NEXT: [[TMP45:%.*]] = mul i64 [[TMP40]], 2
; CHECK-NEXT: [[TMP46:%.*]] = add i64 [[TMP45]], [[TMP44]]
; CHECK-NEXT: [[TMP47:%.*]] = icmp ne i64 [[TMP44]], [[TMP40]]
; CHECK-NEXT: [[TMP48:%.*]] = select i1 [[TMP47]], i64 [[TMP46]], i64 [[TMP43]]
; CHECK-NEXT: [[TMP49:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> [[TMP30]], i1 true)
; CHECK-NEXT: [[TMP50:%.*]] = mul i64 [[TMP40]], 1
; CHECK-NEXT: [[TMP51:%.*]] = add i64 [[TMP50]], [[TMP49]]
; CHECK-NEXT: [[TMP52:%.*]] = icmp ne i64 [[TMP49]], [[TMP40]]
; CHECK-NEXT: [[TMP53:%.*]] = select i1 [[TMP52]], i64 [[TMP51]], i64 [[TMP48]]
; CHECK-NEXT: [[TMP61:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> [[TMP32]], i1 true)
; CHECK-NEXT: [[TMP16:%.*]] = add i64 [[INDEX1]], [[TMP61]]
; CHECK-NEXT: [[TMP55:%.*]] = mul i64 [[TMP40]], 0
; CHECK-NEXT: [[TMP56:%.*]] = add i64 [[TMP55]], [[TMP61]]
; CHECK-NEXT: [[TMP57:%.*]] = icmp ne i64 [[TMP61]], [[TMP40]]
; CHECK-NEXT: [[TMP58:%.*]] = select i1 [[TMP57]], i64 [[TMP56]], i64 [[TMP53]]
; CHECK-NEXT: [[TMP16:%.*]] = add i64 [[INDEX1]], [[TMP58]]
; CHECK-NEXT: [[TMP17:%.*]] = add i64 3, [[TMP16]]
; CHECK-NEXT: br label [[LOOP_END]]
; CHECK: scalar.ph:
@@ -1,13 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; REQUIRES: asserts
; RUN: opt -p loop-vectorize -enable-early-exit-vectorization -force-vector-width=4 \
; RUN: -debug-only=loop-vectorize -S %s 2>%t | FileCheck --check-prefix=VF4IC4 %s
; RUN: cat %t | FileCheck --check-prefix=DEBUG %s
; RUN: opt -p loop-vectorize -enable-early-exit-vectorization -force-vector-width=4 -S %s | FileCheck --check-prefix=VF4IC4 %s

declare void @init_mem(ptr, i64);

; DEBUG: Interleaving not supported for loops with uncountable early exits

define i64 @multi_exiting_to_different_exits_live_in_exit_values() {
; VF4IC4-LABEL: define i64 @multi_exiting_to_different_exits_live_in_exit_values() {
; VF4IC4-NEXT: [[ENTRY:.*]]:
@@ -20,10 +15,22 @@ define i64 @multi_exiting_to_different_exits_live_in_exit_values() {
; VF4IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; VF4IC4-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[INDEX]]
; VF4IC4-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i32 0
; VF4IC4-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i32 4
; VF4IC4-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i32 8
; VF4IC4-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i32 12
; VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4
; VF4IC4-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, ptr [[TMP12]], align 4
; VF4IC4-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i32>, ptr [[TMP13]], align 4
; VF4IC4-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i32>, ptr [[TMP14]], align 4
; VF4IC4-NEXT: [[TMP2:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD]], splat (i32 10)
; VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; VF4IC4-NEXT: [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP2]])
; VF4IC4-NEXT: [[TMP6:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD1]], splat (i32 10)
; VF4IC4-NEXT: [[TMP7:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD2]], splat (i32 10)
; VF4IC4-NEXT: [[TMP8:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD3]], splat (i32 10)
; VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; VF4IC4-NEXT: [[TMP9:%.*]] = or <4 x i1> [[TMP2]], [[TMP6]]
; VF4IC4-NEXT: [[TMP10:%.*]] = or <4 x i1> [[TMP9]], [[TMP7]]
; VF4IC4-NEXT: [[TMP11:%.*]] = or <4 x i1> [[TMP10]], [[TMP8]]
; VF4IC4-NEXT: [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP11]])
; VF4IC4-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128
; VF4IC4-NEXT: [[TMP5:%.*]] = or i1 [[TMP3]], [[TMP4]]
; VF4IC4-NEXT: br i1 [[TMP5]], label %[[MIDDLE_SPLIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]