
Commit 7850d74

[VPlan] Add ExtractLane VPInst to extract across multiple parts.
This patch adds a new ExtractLane VPInstruction which extracts across multiple parts using a wide index, to be used in combination with FirstActiveLane. The patch updates early-exit codegen to use it instead of ExtractElement, which is only per-part. With this change, interleaving should work correctly with early-exit loops. The patch removes the restrictions added in 6f43754 (llvm#145877), but does not yet automatically select interleave counts > 1 for early-exit loops; I'll share a patch for that as a follow-up. The cost of extracting a lane adds non-trivial overhead in the exit block, so it should be considered when picking the interleave count.
1 parent 4dc989d commit 7850d74
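For intuition: with interleaving, each unrolled part produces its own VF-wide vector, and ExtractLane treats its lane operand as an index into the concatenation of those vectors. A minimal scalar sketch of that semantics (illustrative only, not code from the patch; all names are invented):

#include <cstdio>
#include <vector>

// Minimal sketch (not from the patch): ExtractLane semantically indexes the
// concatenation of the per-part vectors with a single wide lane index.
// `Parts` holds one VF-element vector per unrolled part; `Lane` may point
// into any of them.
int extractLaneModel(unsigned Lane, unsigned VF,
                     const std::vector<std::vector<int>> &Parts) {
  unsigned Part = Lane / VF;        // which unrolled copy holds the lane
  unsigned Idx = Lane - Part * VF;  // offset within that copy
  return Parts[Part][Idx];
}

int main() {
  // VF = 4, interleave count 2: lane 5 is element 1 of the second part.
  std::vector<std::vector<int>> Parts = {{10, 11, 12, 13}, {20, 21, 22, 23}};
  std::printf("%d\n", extractLaneModel(5, 4, Parts)); // prints 21
}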

10 files changed: +480 -79 lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 0 additions & 6 deletions
@@ -10046,12 +10046,6 @@ bool LoopVectorizePass::processLoop(Loop *L) {
   // Get user vectorization factor and interleave count.
   ElementCount UserVF = Hints.getWidth();
   unsigned UserIC = Hints.getInterleave();
-  if (LVL.hasUncountableEarlyExit() && UserIC != 1) {
-    UserIC = 1;
-    reportVectorizationInfo("Interleaving not supported for loops "
-                            "with uncountable early exits",
-                            "InterleaveEarlyExitDisabled", ORE, L);
-  }

   // Plan how to best vectorize.
   LVP.plan(UserVF, UserIC);

llvm/lib/Transforms/Vectorize/VPlan.h

Lines changed: 4 additions & 0 deletions
@@ -1012,6 +1012,10 @@ class LLVM_ABI_FOR_TEST VPInstruction : public VPRecipeWithIRFlags,
     ReductionStartVector,
     // Creates a step vector starting from 0 to VF with a step of 1.
     StepVector,
+    /// Extracts a single lane (first operand) from a set of vector operands.
+    /// The lane specifies an index into a vector formed by combining all vector
+    /// operands (all operands after the first one).
+    ExtractLane,

   };
llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp

Lines changed: 2 additions & 0 deletions
@@ -109,6 +109,8 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) {
   case VPInstruction::BuildStructVector:
   case VPInstruction::BuildVector:
     return SetResultTyFromOp();
+  case VPInstruction::ExtractLane:
+    return inferScalarType(R->getOperand(1));
   case VPInstruction::FirstActiveLane:
     return Type::getIntNTy(Ctx, 64);
   case VPInstruction::ExtractLastElement:

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp

Lines changed: 33 additions & 1 deletion
@@ -862,6 +862,31 @@ Value *VPInstruction::generate(VPTransformState &State) {
       Res = Builder.CreateOr(Res, State.get(Op));
     return Builder.CreateOrReduce(Res);
   }
+  case VPInstruction::ExtractLane: {
+    Value *LaneToExtract = State.get(getOperand(0), true);
+    Type *IdxTy = State.TypeAnalysis.inferScalarType(getOperand(0));
+    Value *Res = nullptr;
+    Value *RuntimeVF = getRuntimeVF(State.Builder, IdxTy, State.VF);
+
+    for (unsigned Idx = 1; Idx != getNumOperands(); ++Idx) {
+      Value *VectorStart =
+          Builder.CreateMul(RuntimeVF, ConstantInt::get(IdxTy, Idx - 1));
+      Value *VectorIdx = Idx == 1
+                             ? LaneToExtract
+                             : Builder.CreateSub(LaneToExtract, VectorStart);
+      Value *Ext = State.VF.isScalar()
+                       ? State.get(getOperand(Idx))
+                       : Builder.CreateExtractElement(
+                             State.get(getOperand(Idx)), VectorIdx);
+      if (Res) {
+        Value *Cmp = Builder.CreateICmpUGE(LaneToExtract, VectorStart);
+        Res = Builder.CreateSelect(Cmp, Ext, Res);
+      } else {
+        Res = Ext;
+      }
+    }
+    return Res;
+  }
   case VPInstruction::FirstActiveLane: {
     if (getNumOperands() == 1) {
       Value *Mask = State.get(getOperand(0));
@@ -920,7 +945,8 @@ InstructionCost VPInstruction::computeCost(ElementCount VF,
   }

   switch (getOpcode()) {
-  case Instruction::ExtractElement: {
+  case Instruction::ExtractElement:
+  case VPInstruction::ExtractLane: {
     // Add on the cost of extracting the element.
     auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
     return Ctx.TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy,
@@ -982,6 +1008,7 @@ bool VPInstruction::isVectorToScalar() const {
   return getOpcode() == VPInstruction::ExtractLastElement ||
          getOpcode() == VPInstruction::ExtractPenultimateElement ||
          getOpcode() == Instruction::ExtractElement ||
+         getOpcode() == VPInstruction::ExtractLane ||
          getOpcode() == VPInstruction::FirstActiveLane ||
          getOpcode() == VPInstruction::ComputeAnyOfResult ||
          getOpcode() == VPInstruction::ComputeFindIVResult ||
@@ -1040,6 +1067,7 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const {
   case VPInstruction::BuildVector:
   case VPInstruction::CalculateTripCountMinusVF:
   case VPInstruction::CanonicalIVIncrementForPart:
+  case VPInstruction::ExtractLane:
   case VPInstruction::ExtractLastElement:
   case VPInstruction::ExtractPenultimateElement:
   case VPInstruction::FirstActiveLane:
@@ -1065,6 +1093,7 @@ bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
   default:
     return false;
   case Instruction::ExtractElement:
+  case VPInstruction::ExtractLane:
     return Op == getOperand(1);
   case Instruction::PHI:
     return true;
@@ -1166,6 +1195,9 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
   case VPInstruction::BuildVector:
     O << "buildvector";
     break;
+  case VPInstruction::ExtractLane:
+    O << "extract-lane";
+    break;
   case VPInstruction::ExtractLastElement:
     O << "extract-last-element";
     break;
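As a rough, hedged trace of what the generate() lowering above emits, the following straight-line C++ mirrors the Builder calls (CreateMul, CreateSub, CreateExtractElement, CreateICmpUGE, CreateSelect) for two vector operands at a fixed VF of 4. All names are invented; the clamped indexing stands in for out-of-range extracts, which in the emitted IR yield poison that the select discards.

#include <cstdio>

// Hedged trace (illustrative, not from the patch) of the instruction sequence
// the ExtractLane lowering emits for two vector operands with VF = 4.
// Out-of-range extracts are clamped here; in the emitted IR they yield poison
// and are discarded by the select.
int extractLaneTrace(unsigned long Lane, const int (&Part0)[4],
                     const int (&Part1)[4]) {
  unsigned long VectorStart = 4 * 1;    // RuntimeVF * (Idx - 1) for Idx == 2
  int Res = Part0[Lane < 4 ? Lane : 3]; // extractelement from part 0 at Lane
  int Ext =                             // extractelement from part 1 at
      Part1[Lane >= VectorStart ? Lane - VectorStart : 0]; // Lane - 4
  bool Cmp = Lane >= VectorStart;       // icmp uge
  return Cmp ? Ext : Res;               // select folds part 1 into the result
}

int main() {
  int A[4] = {10, 11, 12, 13}, B[4] = {20, 21, 22, 23};
  std::printf("%d\n", extractLaneTrace(5, A, B)); // prints 21
}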

llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp

Lines changed: 4 additions & 4 deletions
@@ -774,10 +774,10 @@ static VPValue *optimizeEarlyExitInductionUser(VPlan &Plan,
   using namespace VPlanPatternMatch;

   VPValue *Incoming, *Mask;
-  if (!match(Op, m_VPInstruction<Instruction::ExtractElement>(
-                     m_VPValue(Incoming),
+  if (!match(Op, m_VPInstruction<VPInstruction::ExtractLane>(
                      m_VPInstruction<VPInstruction::FirstActiveLane>(
-                         m_VPValue(Mask)))))
+                         m_VPValue(Mask)),
+                 m_VPValue(Incoming))))
     return nullptr;

   auto *WideIV = getOptimizableIVOf(Incoming);
@@ -2831,7 +2831,7 @@ void VPlanTransforms::handleUncountableEarlyExit(
       VPInstruction::FirstActiveLane, {CondToEarlyExit}, nullptr,
       "first.active.lane");
   IncomingFromEarlyExit = EarlyExitB.createNaryOp(
-      Instruction::ExtractElement, {IncomingFromEarlyExit, FirstActiveLane},
+      VPInstruction::ExtractLane, {FirstActiveLane, IncomingFromEarlyExit},
       nullptr, "early.exit.value");
   ExitIRI->setOperand(EarlyExitIdx, IncomingFromEarlyExit);
 }

llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp

Lines changed: 7 additions & 0 deletions
@@ -363,6 +363,13 @@ void UnrollState::unrollBlock(VPBlockBase *VPB) {
       continue;
     }
     VPValue *Op0;
+    if (match(&R, m_VPInstruction<VPInstruction::ExtractLane>(
+                      m_VPValue(Op0), m_VPValue(Op1)))) {
+      addUniformForAllParts(cast<VPInstruction>(&R));
+      for (unsigned Part = 1; Part != UF; ++Part)
+        R.addOperand(getValueForPart(Op1, Part));
+      continue;
+    }
     if (match(&R, m_VPInstruction<VPInstruction::ExtractLastElement>(
                       m_VPValue(Op0))) ||
         match(&R, m_VPInstruction<VPInstruction::ExtractPenultimateElement>(

llvm/test/Transforms/LoopVectorize/AArch64/single-early-exit-interleave.ll

Lines changed: 56 additions & 6 deletions
@@ -14,29 +14,60 @@ define i64 @same_exit_block_pre_inc_use1() #0 {
 ; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024)
 ; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024)
 ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 16
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 64
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 510, [[TMP1]]
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK: vector.ph:
 ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 16
+; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 64
 ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 510, [[TMP3]]
 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 510, [[N_MOD_VF]]
 ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 16
+; CHECK-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 64
 ; CHECK-NEXT: [[INDEX_NEXT:%.*]] = add i64 3, [[N_VEC]]
 ; CHECK-NEXT: br label [[LOOP:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], [[LOOP]] ]
 ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]]
 ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[OFFSET_IDX]]
 ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0
+; CHECK-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP19:%.*]] = mul nuw i64 [[TMP18]], 16
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i64 [[TMP19]]
+; CHECK-NEXT: [[TMP29:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP36:%.*]] = mul nuw i64 [[TMP29]], 32
+; CHECK-NEXT: [[TMP37:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i64 [[TMP36]]
+; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP38:%.*]] = mul nuw i64 [[TMP15]], 48
+; CHECK-NEXT: [[TMP54:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i64 [[TMP38]]
 ; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 16 x i8>, ptr [[TMP8]], align 1
+; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 16 x i8>, ptr [[TMP11]], align 1
+; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 16 x i8>, ptr [[TMP37]], align 1
+; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <vscale x 16 x i8>, ptr [[TMP54]], align 1
 ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[OFFSET_IDX]]
 ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i32 0
+; CHECK-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP21:%.*]] = mul nuw i64 [[TMP20]], 16
+; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i64 [[TMP21]]
+; CHECK-NEXT: [[TMP23:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP24:%.*]] = mul nuw i64 [[TMP23]], 32
+; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i64 [[TMP24]]
+; CHECK-NEXT: [[TMP26:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP27:%.*]] = mul nuw i64 [[TMP26]], 48
+; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i64 [[TMP27]]
 ; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <vscale x 16 x i8>, ptr [[TMP10]], align 1
+; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <vscale x 16 x i8>, ptr [[TMP22]], align 1
+; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <vscale x 16 x i8>, ptr [[TMP25]], align 1
+; CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load <vscale x 16 x i8>, ptr [[TMP28]], align 1
 ; CHECK-NEXT: [[TMP32:%.*]] = icmp ne <vscale x 16 x i8> [[WIDE_LOAD4]], [[WIDE_LOAD8]]
+; CHECK-NEXT: [[TMP30:%.*]] = icmp ne <vscale x 16 x i8> [[WIDE_LOAD2]], [[WIDE_LOAD6]]
+; CHECK-NEXT: [[TMP31:%.*]] = icmp ne <vscale x 16 x i8> [[WIDE_LOAD3]], [[WIDE_LOAD7]]
+; CHECK-NEXT: [[TMP59:%.*]] = icmp ne <vscale x 16 x i8> [[WIDE_LOAD5]], [[WIDE_LOAD9]]
 ; CHECK-NEXT: [[INDEX_NEXT3]] = add nuw i64 [[INDEX1]], [[TMP5]]
-; CHECK-NEXT: [[TMP12:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[TMP32]])
+; CHECK-NEXT: [[TMP33:%.*]] = or <vscale x 16 x i1> [[TMP32]], [[TMP30]]
+; CHECK-NEXT: [[TMP34:%.*]] = or <vscale x 16 x i1> [[TMP33]], [[TMP31]]
+; CHECK-NEXT: [[TMP35:%.*]] = or <vscale x 16 x i1> [[TMP34]], [[TMP59]]
+; CHECK-NEXT: [[TMP12:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[TMP35]])
 ; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT3]], [[N_VEC]]
 ; CHECK-NEXT: [[TMP14:%.*]] = or i1 [[TMP12]], [[TMP13]]
 ; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_SPLIT:%.*]], label [[LOOP]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -46,8 +77,27 @@ define i64 @same_exit_block_pre_inc_use1() #0 {
 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 510, [[N_VEC]]
 ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_END:%.*]], label [[SCALAR_PH]]
 ; CHECK: vector.early.exit:
+; CHECK-NEXT: [[TMP39:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP40:%.*]] = mul nuw i64 [[TMP39]], 16
+; CHECK-NEXT: [[TMP41:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> [[TMP59]], i1 true)
+; CHECK-NEXT: [[TMP42:%.*]] = mul i64 [[TMP40]], 3
+; CHECK-NEXT: [[TMP43:%.*]] = add i64 [[TMP42]], [[TMP41]]
+; CHECK-NEXT: [[TMP44:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> [[TMP31]], i1 true)
+; CHECK-NEXT: [[TMP45:%.*]] = mul i64 [[TMP40]], 2
+; CHECK-NEXT: [[TMP46:%.*]] = add i64 [[TMP45]], [[TMP44]]
+; CHECK-NEXT: [[TMP47:%.*]] = icmp ne i64 [[TMP44]], [[TMP40]]
+; CHECK-NEXT: [[TMP48:%.*]] = select i1 [[TMP47]], i64 [[TMP46]], i64 [[TMP43]]
+; CHECK-NEXT: [[TMP49:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> [[TMP30]], i1 true)
+; CHECK-NEXT: [[TMP50:%.*]] = mul i64 [[TMP40]], 1
+; CHECK-NEXT: [[TMP51:%.*]] = add i64 [[TMP50]], [[TMP49]]
+; CHECK-NEXT: [[TMP52:%.*]] = icmp ne i64 [[TMP49]], [[TMP40]]
+; CHECK-NEXT: [[TMP53:%.*]] = select i1 [[TMP52]], i64 [[TMP51]], i64 [[TMP48]]
 ; CHECK-NEXT: [[TMP61:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> [[TMP32]], i1 true)
-; CHECK-NEXT: [[TMP16:%.*]] = add i64 [[INDEX1]], [[TMP61]]
+; CHECK-NEXT: [[TMP55:%.*]] = mul i64 [[TMP40]], 0
+; CHECK-NEXT: [[TMP56:%.*]] = add i64 [[TMP55]], [[TMP61]]
+; CHECK-NEXT: [[TMP57:%.*]] = icmp ne i64 [[TMP61]], [[TMP40]]
+; CHECK-NEXT: [[TMP58:%.*]] = select i1 [[TMP57]], i64 [[TMP56]], i64 [[TMP53]]
+; CHECK-NEXT: [[TMP16:%.*]] = add i64 [[INDEX1]], [[TMP58]]
 ; CHECK-NEXT: [[TMP17:%.*]] = add i64 3, [[TMP16]]
 ; CHECK-NEXT: br label [[LOOP_END]]
 ; CHECK: scalar.ph:
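The vector.early.exit block above chains one llvm.experimental.cttz.elts per part to compute the first active lane across the whole interleaved iteration, which is exactly the wide index the new ExtractLane consumes. A minimal scalar model of that select chain (a sketch, not code from the patch; cttz of an all-false part is modeled as VF, matching the icmp ne guards in the CHECK lines):

#include <cstdio>
#include <vector>

// Minimal model (not from the patch) of the select chain in the
// vector.early.exit block above: find the first true lane across UF mask
// parts. cttz over an all-false part is modeled as VF, matching the
// `icmp ne <cttz>, <VF>` guards.
size_t firstActiveLane(const std::vector<std::vector<bool>> &MaskParts,
                       size_t VF) {
  auto Cttz = [&](size_t Part) {
    for (size_t I = 0; I != VF; ++I)
      if (MaskParts[Part][I])
        return I;
    return VF;
  };
  size_t NumParts = MaskParts.size();
  // The last part is the unconditional fallback; each earlier part
  // overrides it via a select when it actually contains a set bit.
  size_t Res = VF * (NumParts - 1) + Cttz(NumParts - 1);
  for (size_t Part = NumParts - 1; Part-- != 0;) {
    size_t Tz = Cttz(Part);
    Res = (Tz != VF) ? VF * Part + Tz : Res;
  }
  return Res;
}

int main() {
  // VF = 4, 4 parts: the first set bit overall is lane 6 (part 1, element 2),
  // even though part 2 also has a set bit at lane 8.
  std::vector<std::vector<bool>> Masks = {
      {0, 0, 0, 0}, {0, 0, 1, 1}, {1, 0, 0, 0}, {0, 0, 0, 0}};
  std::printf("%zu\n", firstActiveLane(Masks, 4)); // prints 6
}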

llvm/test/Transforms/LoopVectorize/single-early-exit-interleave-hint.ll

Lines changed: 15 additions & 8 deletions
@@ -1,13 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; REQUIRES: asserts
-; RUN: opt -p loop-vectorize -enable-early-exit-vectorization -force-vector-width=4 \
-; RUN:   -debug-only=loop-vectorize -S %s 2>%t | FileCheck --check-prefix=VF4IC4 %s
-; RUN: cat %t | FileCheck --check-prefix=DEBUG %s
+; RUN: opt -p loop-vectorize -enable-early-exit-vectorization -force-vector-width=4 -S %s | FileCheck --check-prefix=VF4IC4 %s

 declare void @init_mem(ptr, i64);

-; DEBUG: Interleaving not supported for loops with uncountable early exits
-
 define i64 @multi_exiting_to_different_exits_live_in_exit_values() {
 ; VF4IC4-LABEL: define i64 @multi_exiting_to_different_exits_live_in_exit_values() {
 ; VF4IC4-NEXT: [[ENTRY:.*]]:
2015
; VF4IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
2116
; VF4IC4-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[INDEX]]
2217
; VF4IC4-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i32 0
18+
; VF4IC4-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i32 4
19+
; VF4IC4-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i32 8
20+
; VF4IC4-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i32 12
2321
; VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4
22+
; VF4IC4-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, ptr [[TMP12]], align 4
23+
; VF4IC4-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i32>, ptr [[TMP13]], align 4
24+
; VF4IC4-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i32>, ptr [[TMP14]], align 4
2425
; VF4IC4-NEXT: [[TMP2:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD]], splat (i32 10)
25-
; VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
26-
; VF4IC4-NEXT: [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP2]])
26+
; VF4IC4-NEXT: [[TMP6:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD1]], splat (i32 10)
27+
; VF4IC4-NEXT: [[TMP7:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD2]], splat (i32 10)
28+
; VF4IC4-NEXT: [[TMP8:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD3]], splat (i32 10)
29+
; VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
30+
; VF4IC4-NEXT: [[TMP9:%.*]] = or <4 x i1> [[TMP2]], [[TMP6]]
31+
; VF4IC4-NEXT: [[TMP10:%.*]] = or <4 x i1> [[TMP9]], [[TMP7]]
32+
; VF4IC4-NEXT: [[TMP11:%.*]] = or <4 x i1> [[TMP10]], [[TMP8]]
33+
; VF4IC4-NEXT: [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP11]])
2734
; VF4IC4-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128
2835
; VF4IC4-NEXT: [[TMP5:%.*]] = or i1 [[TMP3]], [[TMP4]]
2936
; VF4IC4-NEXT: br i1 [[TMP5]], label %[[MIDDLE_SPLIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
