diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index f57ce0c3ccb4d..5a6883fd8d5c3 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -487,6 +487,9 @@ class LoopVectorizationPlanner {
   /// all profitable VFs in ProfitableVFs.
   VectorizationFactor computeBestVF();
 
+  unsigned selectInterleaveCount(VPlan &Plan, ElementCount VF,
+                                 InstructionCost LoopCost);
+
   /// Generate the IR code for the vectorized loop captured in VPlan \p BestPlan
   /// according to the best selected \p VF and \p UF.
   ///
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 773c1559ec679..7f48d79a67192 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -955,13 +955,6 @@ class LoopVectorizationCostModel {
   /// 64 bit loop indices.
   std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
 
-  /// \return The desired interleave count.
-  /// If interleave count has been specified by metadata it will be returned.
-  /// Otherwise, the interleave count is computed and returned. VF and LoopCost
-  /// are the selected vectorization factor and the cost of the selected VF.
-  unsigned selectInterleaveCount(VPlan &Plan, ElementCount VF,
-                                 InstructionCost LoopCost);
-
   /// Memory access instruction may be vectorized in more than one way.
   /// Form of instruction after vectorization depends on cost.
   /// This function takes cost-based decisions for Load/Store instructions
@@ -4611,8 +4604,8 @@ void LoopVectorizationCostModel::collectElementTypesForWidening() {
 }
 
 unsigned
-LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF,
-                                                  InstructionCost LoopCost) {
+LoopVectorizationPlanner::selectInterleaveCount(VPlan &Plan, ElementCount VF,
+                                                InstructionCost LoopCost) {
   // -- The interleave heuristics --
   // We interleave the loop in order to expose ILP and reduce the loop overhead.
   // There are many micro-architectural considerations that we can't predict
@@ -4627,11 +4620,11 @@ LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF,
   // 3. We don't interleave if we think that we will spill registers to memory
   // due to the increased register pressure.
 
-  if (!isScalarEpilogueAllowed())
+  if (!CM.isScalarEpilogueAllowed())
     return 1;
 
-  // Do not interleave if EVL is preferred and no User IC is specified.
-  if (foldTailWithEVL()) {
+  if (any_of(Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis(),
+             IsaPred<VPEVLBasedIVPHIRecipe>)) {
     LLVM_DEBUG(dbgs() << "LV: Preference for VP intrinsics indicated. "
                          "Unroll factor forced to be 1.\n");
     return 1;
@@ -4644,15 +4637,20 @@ LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF,
   // We don't attempt to perform interleaving for loops with uncountable early
   // exits because the VPInstruction::AnyOf code cannot currently handle
   // multiple parts.
-  if (Legal->hasUncountableEarlyExit())
+  if (Plan.hasEarlyExit())
     return 1;
 
-  const bool HasReductions = !Legal->getReductionVars().empty();
+  const bool HasReductions =
+      any_of(Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis(),
+             IsaPred<VPReductionPHIRecipe>);
 
   // If we did not calculate the cost for VF (because the user selected the VF)
   // then we calculate the cost of VF here.
   if (LoopCost == 0) {
-    LoopCost = expectedCost(VF);
+    if (VF.isScalar())
+      LoopCost = CM.expectedCost(VF);
+    else
+      LoopCost = cost(Plan, VF);
     assert(LoopCost.isValid() &&
            "Expected to have chosen a VF with valid cost");
     // Loop body is free and there is no need for interleaving.
@@ -4661,7 +4659,7 @@ LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF,
   }
 
   VPRegisterUsage R =
-      calculateRegisterUsageForPlan(Plan, {VF}, TTI, ValuesToIgnore)[0];
+      calculateRegisterUsageForPlan(Plan, {VF}, TTI, CM.ValuesToIgnore)[0];
   // We divide by these constants so assume that we have at least one
   // instruction that uses at least one register.
   for (auto &Pair : R.MaxLocalUsers) {
@@ -4722,21 +4720,21 @@ LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF,
     MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
   }
 
-  unsigned EstimatedVF = getEstimatedRuntimeVF(VF, VScaleForTuning);
+  unsigned EstimatedVF = getEstimatedRuntimeVF(VF, CM.getVScaleForTuning());
 
   // Try to get the exact trip count, or an estimate based on profiling data or
   // ConstantMax from PSE, failing that.
-  if (auto BestKnownTC = getSmallBestKnownTC(PSE, TheLoop)) {
+  if (auto BestKnownTC = getSmallBestKnownTC(PSE, OrigLoop)) {
     // At least one iteration must be scalar when this constraint holds. So the
     // maximum available iterations for interleaving is one less.
-    unsigned AvailableTC = requiresScalarEpilogue(VF.isVector())
+    unsigned AvailableTC = CM.requiresScalarEpilogue(VF.isVector())
                                ? BestKnownTC->getFixedValue() - 1
                                : BestKnownTC->getFixedValue();
 
     unsigned InterleaveCountLB = bit_floor(std::max(
         1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount)));
 
-    if (getSmallConstantTripCount(PSE.getSE(), TheLoop).isNonZero()) {
+    if (getSmallConstantTripCount(PSE.getSE(), OrigLoop).isNonZero()) {
       // If the best known trip count is exact, we select between two
       // prospective ICs, where
       //
@@ -4797,7 +4795,7 @@ LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF,
   // vectorized the loop we will have done the runtime check and so interleaving
   // won't require further checks.
   bool ScalarInterleavingRequiresPredication =
-      (VF.isScalar() && any_of(TheLoop->blocks(), [this](BasicBlock *BB) {
+      (VF.isScalar() && any_of(OrigLoop->blocks(), [this](BasicBlock *BB) {
         return Legal->blockNeedsPredication(BB);
       }));
   bool ScalarInterleavingRequiresRuntimePointerCheck =
@@ -4820,8 +4818,39 @@ LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF,
 
   // Interleave until store/load ports (estimated by max interleave count) are
   // saturated.
-  unsigned NumStores = Legal->getNumStores();
-  unsigned NumLoads = Legal->getNumLoads();
+  unsigned NumStores = 0;
+  unsigned NumLoads = 0;
+  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
+           vp_depth_first_deep(Plan.getVectorLoopRegion()->getEntry()))) {
+    for (VPRecipeBase &R : *VPBB) {
+      if (isa<VPWidenLoadRecipe>(&R)) {
+        NumLoads++;
+        continue;
+      }
+      if (isa<VPWidenStoreRecipe>(&R)) {
+        NumStores++;
+        continue;
+      }
+
+      if (auto *InterleaveR = dyn_cast<VPInterleaveRecipe>(&R)) {
+        if (unsigned StoreOps = InterleaveR->getNumStoreOperands())
+          NumStores += StoreOps;
+        else
+          NumLoads += InterleaveR->getNumDefinedValues();
+        continue;
+      }
+      if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
+        NumLoads += isa<LoadInst>(RepR->getUnderlyingInstr());
+        NumStores += isa<StoreInst>(RepR->getUnderlyingInstr());
+        continue;
+      }
+      if (isa<VPHistogramRecipe>(&R)) {
+        NumLoads++;
+        NumStores++;
+        continue;
+      }
+    }
+  }
   unsigned StoresIC = IC / (NumStores ? NumStores : 1);
   unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
 
@@ -4831,12 +4860,15 @@ LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF,
   // do the final reduction after the loop.
   bool HasSelectCmpReductions =
       HasReductions &&
-      any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
-        const RecurrenceDescriptor &RdxDesc = Reduction.second;
-        RecurKind RK = RdxDesc.getRecurrenceKind();
-        return RecurrenceDescriptor::isAnyOfRecurrenceKind(RK) ||
-               RecurrenceDescriptor::isFindIVRecurrenceKind(RK);
-      });
+      any_of(Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis(),
+             [](VPRecipeBase &R) {
+               auto *RedR = dyn_cast<VPReductionPHIRecipe>(&R);
+
+               return RedR && (RecurrenceDescriptor::isAnyOfRecurrenceKind(
+                                   RedR->getRecurrenceKind()) ||
+                               RecurrenceDescriptor::isFindIVRecurrenceKind(
+                                   RedR->getRecurrenceKind()));
+             });
   if (HasSelectCmpReductions) {
     LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n");
     return 1;
@@ -4847,12 +4879,14 @@ LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF,
   // we're interleaving is inside another loop. For tree-wise reductions
   // set the limit to 2, and for ordered reductions it's best to disable
   // interleaving entirely.
-  if (HasReductions && TheLoop->getLoopDepth() > 1) {
+  if (HasReductions && OrigLoop->getLoopDepth() > 1) {
     bool HasOrderedReductions =
-        any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
-          const RecurrenceDescriptor &RdxDesc = Reduction.second;
-          return RdxDesc.isOrdered();
-        });
+        any_of(Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis(),
+               [](VPRecipeBase &R) {
+                 auto *RedR = dyn_cast<VPReductionPHIRecipe>(&R);
+
+                 return RedR && RedR->isOrdered();
+               });
     if (HasOrderedReductions) {
       LLVM_DEBUG(
           dbgs() << "LV: Not interleaving scalar ordered reductions.\n");
       return 1;
@@ -10071,8 +10105,11 @@ bool LoopVectorizePass::processLoop(Loop *L) {
   GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(), CM.CostKind);
 
   if (LVP.hasPlanWithVF(VF.Width)) {
+    VPCostContext CostCtx(CM.TTI, *CM.TLI, CM.Legal->getWidestInductionType(),
+                          CM, CM.CostKind);
+
     // Select the interleave count.
-    IC = CM.selectInterleaveCount(LVP.getPlanFor(VF.Width), VF.Width, VF.Cost);
+    IC = LVP.selectInterleaveCount(LVP.getPlanFor(VF.Width), VF.Width, VF.Cost);
     unsigned SelectedIC = std::max(IC, UserIC);
 
     // Optimistically generate runtime checks if they are needed. Drop them if
@@ -10083,8 +10120,6 @@ bool LoopVectorizePass::processLoop(Loop *L) {
     // Check if it is profitable to vectorize with runtime checks.
     bool ForceVectorization =
         Hints.getForce() == LoopVectorizeHints::FK_Enabled;
-    VPCostContext CostCtx(CM.TTI, *CM.TLI, CM.Legal->getWidestInductionType(),
-                          CM, CM.CostKind);
     if (!ForceVectorization &&
         !isOutsideLoopWorkProfitable(Checks, VF, L, PSE, CostCtx,
                                      LVP.getPlanFor(VF.Width), SEL,
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index db40ce2d20b81..d759f64d1098b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -4213,7 +4213,10 @@ class VPlan {
   /// block with multiple predecessors (one for the exit via the latch and one
   /// via the other early exit).
bool hasEarlyExit() const { - return ExitBlocks.size() > 1 || + return count_if(ExitBlocks, + [](VPIRBasicBlock *EB) { + return EB->getNumPredecessors() != 0; + }) > 1 || (ExitBlocks.size() == 1 && ExitBlocks[0]->getNumPredecessors() > 1); } diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/predication_costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/predication_costs.ll index 1fcbc8470fc3c..e103a912ff360 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/predication_costs.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/predication_costs.ll @@ -19,7 +19,7 @@ target triple = "aarch64--linux-gnu" ; (udiv(2) + extractelement(8) + insertelement(4)) / 2 = 7 ; ; CHECK: Scalarizing and predicating: %tmp4 = udiv i32 %tmp2, %tmp3 -; CHECK: Found an estimated cost of 7 for VF 2 For instruction: %tmp4 = udiv i32 %tmp2, %tmp3 +; CHECK: Cost of 7 for VF 2: profitable to scalarize %tmp4 = udiv i32 %tmp2, %tmp3 ; define i32 @predicated_udiv(ptr %a, ptr %b, i1 %c, i64 %n) { entry: @@ -60,7 +60,7 @@ for.end: ; (store(4) + extractelement(4)) / 2 = 4 ; ; CHECK: Scalarizing and predicating: store i32 %tmp2, ptr %tmp0, align 4 -; CHECK: Found an estimated cost of 4 for VF 2 For instruction: store i32 %tmp2, ptr %tmp0, align 4 +; CHECK: Cost of 4 for VF 2: profitable to scalarize store i32 %tmp2, ptr %tmp0, align 4 ; define void @predicated_store(ptr %a, i1 %c, i32 %x, i64 %n) { entry: @@ -93,8 +93,8 @@ for.end: ; CHECK: Found scalar instruction: %addr = phi ptr [ %a, %entry ], [ %addr.next, %for.inc ] ; CHECK: Found scalar instruction: %addr.next = getelementptr inbounds i32, ptr %addr, i64 1 ; CHECK: Scalarizing and predicating: store i32 %tmp2, ptr %addr, align 4 -; CHECK: Found an estimated cost of 0 for VF 2 For instruction: %addr = phi ptr [ %a, %entry ], [ %addr.next, %for.inc ] -; CHECK: Found an estimated cost of 4 for VF 2 For instruction: store i32 %tmp2, ptr %addr, align 4 +; CHECK: Cost of 0 for VF 2: induction instruction %addr = phi ptr [ %a, %entry ], [ %addr.next, %for.inc ] +; CHECK: Cost of 4 for VF 2: profitable to scalarize store i32 %tmp2, ptr %addr, align 4 ; define void @predicated_store_phi(ptr %a, i1 %c, i32 %x, i64 %n) { entry: @@ -135,9 +135,10 @@ for.end: ; ; CHECK: Scalarizing: %tmp3 = add nsw i32 %tmp2, %x ; CHECK: Scalarizing and predicating: %tmp4 = udiv i32 %tmp2, %tmp3 -; CHECK: Found an estimated cost of 3 for VF 2 For instruction: %tmp3 = add nsw i32 %tmp2, %x -; CHECK: Found an estimated cost of 5 for VF 2 For instruction: %tmp4 = udiv i32 %tmp2, %tmp3 +; CHECK: Cost of 3 for VF 2: profitable to scalarize %tmp3 = add nsw i32 %tmp2, %x +; CHECK: Cost of 5 for VF 2: profitable to scalarize %tmp4 = udiv i32 %tmp2, %tmp3 ; + define i32 @predicated_udiv_scalarized_operand(ptr %a, i1 %c, i32 %x, i64 %n) { entry: br label %for.body @@ -180,8 +181,8 @@ for.end: ; ; CHECK: Scalarizing: %tmp2 = add nsw i32 %tmp1, %x ; CHECK: Scalarizing and predicating: store i32 %tmp2, ptr %tmp0, align 4 -; CHECK: Found an estimated cost of 3 for VF 2 For instruction: %tmp2 = add nsw i32 %tmp1, %x -; CHECK: Found an estimated cost of 2 for VF 2 For instruction: store i32 %tmp2, ptr %tmp0, align 4 +; CHECK: Cost of 3 for VF 2: profitable to scalarize %tmp2 = add nsw i32 %tmp1, %x +; CHECK: Cost of 2 for VF 2: profitable to scalarize store i32 %tmp2, ptr %tmp0, align 4 ; define void @predicated_store_scalarized_operand(ptr %a, i1 %c, i32 %x, i64 %n) { entry: @@ -232,11 +233,11 @@ for.end: ; CHECK: Scalarizing and predicating: %tmp4 = udiv i32 %tmp3, %tmp2 ; CHECK: Scalarizing: 
%tmp5 = sub i32 %tmp4, %x ; CHECK: Scalarizing and predicating: store i32 %tmp5, ptr %tmp0, align 4 -; CHECK: Found an estimated cost of 1 for VF 2 For instruction: %tmp2 = add i32 %tmp1, %x -; CHECK: Found an estimated cost of 7 for VF 2 For instruction: %tmp3 = sdiv i32 %tmp1, %tmp2 -; CHECK: Found an estimated cost of 7 for VF 2 For instruction: %tmp4 = udiv i32 %tmp3, %tmp2 -; CHECK: Found an estimated cost of 3 for VF 2 For instruction: %tmp5 = sub i32 %tmp4, %x -; CHECK: Found an estimated cost of 2 for VF 2 For instruction: store i32 %tmp5, ptr %tmp0, align 4 +; CHECK: Cost of 7 for VF 2: profitable to scalarize %tmp4 = udiv i32 %tmp3, %tmp2 +; CHECK: Cost of 7 for VF 2: profitable to scalarize %tmp3 = sdiv i32 %tmp1, %tmp2 +; CHECK: Cost of 2 for VF 2: profitable to scalarize store i32 %tmp5, ptr %tmp0, align 4 +; CHECK: Cost of 3 for VF 2: profitable to scalarize %tmp5 = sub i32 %tmp4, %x +; CHECK: Cost of 1 for VF 2: WIDEN ir<%tmp2> = add ir<%tmp1>, ir<%x> ; define void @predication_multi_context(ptr %a, i1 %c, i32 %x, i64 %n) { entry: diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll index f59ab56711685..c0287a0beb2f9 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll @@ -197,259 +197,6 @@ exit: } define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocapture noundef readonly %B, i32 noundef signext %n) { -; RV64-LABEL: define void @vector_reverse_i64( -; RV64-SAME: ptr noundef writeonly captures(none) [[A:%.*]], ptr noundef readonly captures(none) [[B:%.*]], i32 noundef signext [[N:%.*]]) #[[ATTR0]] { -; RV64-NEXT: [[ENTRY:.*:]] -; RV64-NEXT: [[A2:%.*]] = ptrtoint ptr [[A]] to i64 -; RV64-NEXT: [[B1:%.*]] = ptrtoint ptr [[B]] to i64 -; RV64-NEXT: [[CMP7:%.*]] = icmp sgt i32 [[N]], 0 -; RV64-NEXT: br i1 [[CMP7]], label %[[FOR_BODY_PREHEADER:.*]], label %[[FOR_COND_CLEANUP:.*]] -; RV64: [[FOR_BODY_PREHEADER]]: -; RV64-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64 -; RV64-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; RV64-NEXT: [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4 -; RV64-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]] -; RV64-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]] -; RV64: [[VECTOR_SCEVCHECK]]: -; RV64-NEXT: [[TMP3:%.*]] = add nsw i64 [[TMP0]], -1 -; RV64-NEXT: [[TMP4:%.*]] = add i32 [[N]], -1 -; RV64-NEXT: [[TMP5:%.*]] = trunc i64 [[TMP3]] to i32 -; RV64-NEXT: [[MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 1, i32 [[TMP5]]) -; RV64-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL]], 0 -; RV64-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL]], 1 -; RV64-NEXT: [[TMP6:%.*]] = sub i32 [[TMP4]], [[MUL_RESULT]] -; RV64-NEXT: [[TMP7:%.*]] = icmp ugt i32 [[TMP6]], [[TMP4]] -; RV64-NEXT: [[TMP8:%.*]] = or i1 [[TMP7]], [[MUL_OVERFLOW]] -; RV64-NEXT: [[TMP9:%.*]] = icmp ugt i64 [[TMP3]], 4294967295 -; RV64-NEXT: [[TMP10:%.*]] = or i1 [[TMP8]], [[TMP9]] -; RV64-NEXT: br i1 [[TMP10]], label %[[SCALAR_PH]], label %[[VECTOR_MEMCHECK:.*]] -; RV64: [[VECTOR_MEMCHECK]]: -; RV64-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() -; RV64-NEXT: [[TMP12:%.*]] = mul nuw i64 [[TMP11]], 4 -; RV64-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 4 -; RV64-NEXT: [[TMP14:%.*]] = sub i64 [[B1]], [[A2]] -; RV64-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP14]], [[TMP13]] -; RV64-NEXT: br i1 
[[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] -; RV64: [[VECTOR_PH]]: -; RV64-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() -; RV64-NEXT: [[TMP16:%.*]] = mul nuw i64 [[TMP15]], 4 -; RV64-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP16]] -; RV64-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]] -; RV64-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64() -; RV64-NEXT: [[TMP18:%.*]] = mul nuw i64 [[TMP17]], 4 -; RV64-NEXT: [[TMP19:%.*]] = sub i64 [[TMP0]], [[N_VEC]] -; RV64-NEXT: [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32 -; RV64-NEXT: [[TMP20:%.*]] = sub i32 [[N]], [[DOTCAST]] -; RV64-NEXT: br label %[[VECTOR_BODY:.*]] -; RV64: [[VECTOR_BODY]]: -; RV64-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; RV64-NEXT: [[DOTCAST3:%.*]] = trunc i64 [[INDEX]] to i32 -; RV64-NEXT: [[OFFSET_IDX:%.*]] = sub i32 [[N]], [[DOTCAST3]] -; RV64-NEXT: [[TMP21:%.*]] = add nsw i32 [[OFFSET_IDX]], -1 -; RV64-NEXT: [[TMP22:%.*]] = zext i32 [[TMP21]] to i64 -; RV64-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP22]] -; RV64-NEXT: [[TMP24:%.*]] = mul i64 0, [[TMP18]] -; RV64-NEXT: [[TMP25:%.*]] = sub i64 [[TMP18]], 1 -; RV64-NEXT: [[TMP26:%.*]] = mul i64 -1, [[TMP25]] -; RV64-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[TMP23]], i64 [[TMP24]] -; RV64-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[TMP27]], i64 [[TMP26]] -; RV64-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP28]], align 4 -; RV64-NEXT: [[REVERSE:%.*]] = call @llvm.vector.reverse.nxv4i32( [[WIDE_LOAD]]) -; RV64-NEXT: [[TMP29:%.*]] = add [[REVERSE]], splat (i32 1) -; RV64-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP22]] -; RV64-NEXT: [[TMP31:%.*]] = mul i64 0, [[TMP18]] -; RV64-NEXT: [[TMP32:%.*]] = sub i64 [[TMP18]], 1 -; RV64-NEXT: [[TMP33:%.*]] = mul i64 -1, [[TMP32]] -; RV64-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32, ptr [[TMP30]], i64 [[TMP31]] -; RV64-NEXT: [[TMP35:%.*]] = getelementptr inbounds i32, ptr [[TMP34]], i64 [[TMP33]] -; RV64-NEXT: [[REVERSE4:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP29]]) -; RV64-NEXT: store [[REVERSE4]], ptr [[TMP35]], align 4 -; RV64-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP18]] -; RV64-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; RV64-NEXT: br i1 [[TMP36]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] -; RV64: [[MIDDLE_BLOCK]]: -; RV64-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] -; RV64-NEXT: br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]], label %[[SCALAR_PH]] -; RV64: [[SCALAR_PH]]: -; RV64-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP19]], %[[MIDDLE_BLOCK]] ], [ [[TMP0]], %[[FOR_BODY_PREHEADER]] ], [ [[TMP0]], %[[VECTOR_SCEVCHECK]] ], [ [[TMP0]], %[[VECTOR_MEMCHECK]] ] -; RV64-NEXT: [[BC_RESUME_VAL5:%.*]] = phi i32 [ [[TMP20]], %[[MIDDLE_BLOCK]] ], [ [[N]], %[[FOR_BODY_PREHEADER]] ], [ [[N]], %[[VECTOR_SCEVCHECK]] ], [ [[N]], %[[VECTOR_MEMCHECK]] ] -; RV64-NEXT: br label %[[FOR_BODY:.*]] -; RV64: [[FOR_COND_CLEANUP_LOOPEXIT]]: -; RV64-NEXT: br label %[[FOR_COND_CLEANUP]] -; RV64: [[FOR_COND_CLEANUP]]: -; RV64-NEXT: ret void -; RV64: [[FOR_BODY]]: -; -; RV32-LABEL: define void @vector_reverse_i64( -; RV32-SAME: ptr noundef writeonly captures(none) [[A:%.*]], ptr noundef readonly captures(none) [[B:%.*]], i32 noundef signext [[N:%.*]]) #[[ATTR0]] { -; RV32-NEXT: [[ENTRY:.*:]] -; RV32-NEXT: [[A2:%.*]] = ptrtoint ptr [[A]] to i32 -; RV32-NEXT: [[B1:%.*]] = ptrtoint ptr [[B]] to i32 -; 
RV32-NEXT: [[CMP7:%.*]] = icmp sgt i32 [[N]], 0 -; RV32-NEXT: br i1 [[CMP7]], label %[[FOR_BODY_PREHEADER:.*]], label %[[FOR_COND_CLEANUP:.*]] -; RV32: [[FOR_BODY_PREHEADER]]: -; RV32-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64 -; RV32-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; RV32-NEXT: [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4 -; RV32-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]] -; RV32-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] -; RV32: [[VECTOR_MEMCHECK]]: -; RV32-NEXT: [[TMP3:%.*]] = call i32 @llvm.vscale.i32() -; RV32-NEXT: [[TMP4:%.*]] = mul nuw i32 [[TMP3]], 4 -; RV32-NEXT: [[TMP5:%.*]] = mul i32 [[TMP4]], 4 -; RV32-NEXT: [[TMP6:%.*]] = sub i32 [[B1]], [[A2]] -; RV32-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i32 [[TMP6]], [[TMP5]] -; RV32-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] -; RV32: [[VECTOR_PH]]: -; RV32-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() -; RV32-NEXT: [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 4 -; RV32-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP8]] -; RV32-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]] -; RV32-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() -; RV32-NEXT: [[TMP10:%.*]] = mul nuw i64 [[TMP9]], 4 -; RV32-NEXT: [[TMP11:%.*]] = sub i64 [[TMP0]], [[N_VEC]] -; RV32-NEXT: [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32 -; RV32-NEXT: [[TMP12:%.*]] = sub i32 [[N]], [[DOTCAST]] -; RV32-NEXT: br label %[[VECTOR_BODY:.*]] -; RV32: [[VECTOR_BODY]]: -; RV32-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; RV32-NEXT: [[DOTCAST3:%.*]] = trunc i64 [[INDEX]] to i32 -; RV32-NEXT: [[OFFSET_IDX:%.*]] = sub i32 [[N]], [[DOTCAST3]] -; RV32-NEXT: [[TMP13:%.*]] = add nsw i32 [[OFFSET_IDX]], -1 -; RV32-NEXT: [[TMP14:%.*]] = zext i32 [[TMP13]] to i64 -; RV32-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP14]] -; RV32-NEXT: [[TMP16:%.*]] = trunc i64 [[TMP10]] to i32 -; RV32-NEXT: [[TMP17:%.*]] = mul i32 0, [[TMP16]] -; RV32-NEXT: [[TMP18:%.*]] = sub i32 [[TMP16]], 1 -; RV32-NEXT: [[TMP19:%.*]] = mul i32 -1, [[TMP18]] -; RV32-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i32 [[TMP17]] -; RV32-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[TMP20]], i32 [[TMP19]] -; RV32-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP21]], align 4 -; RV32-NEXT: [[REVERSE:%.*]] = call @llvm.vector.reverse.nxv4i32( [[WIDE_LOAD]]) -; RV32-NEXT: [[TMP22:%.*]] = add [[REVERSE]], splat (i32 1) -; RV32-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP14]] -; RV32-NEXT: [[TMP24:%.*]] = trunc i64 [[TMP10]] to i32 -; RV32-NEXT: [[TMP25:%.*]] = mul i32 0, [[TMP24]] -; RV32-NEXT: [[TMP26:%.*]] = sub i32 [[TMP24]], 1 -; RV32-NEXT: [[TMP27:%.*]] = mul i32 -1, [[TMP26]] -; RV32-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[TMP23]], i32 [[TMP25]] -; RV32-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, ptr [[TMP28]], i32 [[TMP27]] -; RV32-NEXT: [[REVERSE4:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP22]]) -; RV32-NEXT: store [[REVERSE4]], ptr [[TMP29]], align 4 -; RV32-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]] -; RV32-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; RV32-NEXT: br i1 [[TMP30]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] -; RV32: [[MIDDLE_BLOCK]]: -; RV32-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] -; RV32-NEXT: br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]], label %[[SCALAR_PH]] -; 
RV32: [[SCALAR_PH]]: -; RV32-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP11]], %[[MIDDLE_BLOCK]] ], [ [[TMP0]], %[[FOR_BODY_PREHEADER]] ], [ [[TMP0]], %[[VECTOR_MEMCHECK]] ] -; RV32-NEXT: [[BC_RESUME_VAL5:%.*]] = phi i32 [ [[TMP12]], %[[MIDDLE_BLOCK]] ], [ [[N]], %[[FOR_BODY_PREHEADER]] ], [ [[N]], %[[VECTOR_MEMCHECK]] ] -; RV32-NEXT: br label %[[FOR_BODY:.*]] -; RV32: [[FOR_COND_CLEANUP_LOOPEXIT]]: -; RV32-NEXT: br label %[[FOR_COND_CLEANUP]] -; RV32: [[FOR_COND_CLEANUP]]: -; RV32-NEXT: ret void -; RV32: [[FOR_BODY]]: -; -; RV64-UF2-LABEL: define void @vector_reverse_i64( -; RV64-UF2-SAME: ptr noundef writeonly captures(none) [[A:%.*]], ptr noundef readonly captures(none) [[B:%.*]], i32 noundef signext [[N:%.*]]) #[[ATTR0]] { -; RV64-UF2-NEXT: [[ENTRY:.*:]] -; RV64-UF2-NEXT: [[A2:%.*]] = ptrtoint ptr [[A]] to i64 -; RV64-UF2-NEXT: [[B1:%.*]] = ptrtoint ptr [[B]] to i64 -; RV64-UF2-NEXT: [[CMP7:%.*]] = icmp sgt i32 [[N]], 0 -; RV64-UF2-NEXT: br i1 [[CMP7]], label %[[FOR_BODY_PREHEADER:.*]], label %[[FOR_COND_CLEANUP:.*]] -; RV64-UF2: [[FOR_BODY_PREHEADER]]: -; RV64-UF2-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64 -; RV64-UF2-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; RV64-UF2-NEXT: [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 8 -; RV64-UF2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]] -; RV64-UF2-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]] -; RV64-UF2: [[VECTOR_SCEVCHECK]]: -; RV64-UF2-NEXT: [[TMP3:%.*]] = add nsw i64 [[TMP0]], -1 -; RV64-UF2-NEXT: [[TMP4:%.*]] = add i32 [[N]], -1 -; RV64-UF2-NEXT: [[TMP5:%.*]] = trunc i64 [[TMP3]] to i32 -; RV64-UF2-NEXT: [[MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 1, i32 [[TMP5]]) -; RV64-UF2-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL]], 0 -; RV64-UF2-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL]], 1 -; RV64-UF2-NEXT: [[TMP6:%.*]] = sub i32 [[TMP4]], [[MUL_RESULT]] -; RV64-UF2-NEXT: [[TMP7:%.*]] = icmp ugt i32 [[TMP6]], [[TMP4]] -; RV64-UF2-NEXT: [[TMP8:%.*]] = or i1 [[TMP7]], [[MUL_OVERFLOW]] -; RV64-UF2-NEXT: [[TMP9:%.*]] = icmp ugt i64 [[TMP3]], 4294967295 -; RV64-UF2-NEXT: [[TMP10:%.*]] = or i1 [[TMP8]], [[TMP9]] -; RV64-UF2-NEXT: br i1 [[TMP10]], label %[[SCALAR_PH]], label %[[VECTOR_MEMCHECK:.*]] -; RV64-UF2: [[VECTOR_MEMCHECK]]: -; RV64-UF2-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() -; RV64-UF2-NEXT: [[TMP12:%.*]] = mul nuw i64 [[TMP11]], 4 -; RV64-UF2-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 8 -; RV64-UF2-NEXT: [[TMP14:%.*]] = sub i64 [[B1]], [[A2]] -; RV64-UF2-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP14]], [[TMP13]] -; RV64-UF2-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] -; RV64-UF2: [[VECTOR_PH]]: -; RV64-UF2-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() -; RV64-UF2-NEXT: [[TMP16:%.*]] = mul nuw i64 [[TMP15]], 8 -; RV64-UF2-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP16]] -; RV64-UF2-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]] -; RV64-UF2-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64() -; RV64-UF2-NEXT: [[TMP18:%.*]] = mul nuw i64 [[TMP17]], 4 -; RV64-UF2-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 2 -; RV64-UF2-NEXT: [[TMP20:%.*]] = sub i64 [[TMP0]], [[N_VEC]] -; RV64-UF2-NEXT: [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32 -; RV64-UF2-NEXT: [[TMP21:%.*]] = sub i32 [[N]], [[DOTCAST]] -; RV64-UF2-NEXT: br label %[[VECTOR_BODY:.*]] -; RV64-UF2: [[VECTOR_BODY]]: -; RV64-UF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; 
RV64-UF2-NEXT: [[DOTCAST3:%.*]] = trunc i64 [[INDEX]] to i32 -; RV64-UF2-NEXT: [[OFFSET_IDX:%.*]] = sub i32 [[N]], [[DOTCAST3]] -; RV64-UF2-NEXT: [[TMP22:%.*]] = add nsw i32 [[OFFSET_IDX]], -1 -; RV64-UF2-NEXT: [[TMP23:%.*]] = zext i32 [[TMP22]] to i64 -; RV64-UF2-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP23]] -; RV64-UF2-NEXT: [[TMP25:%.*]] = mul i64 0, [[TMP18]] -; RV64-UF2-NEXT: [[TMP26:%.*]] = sub i64 [[TMP18]], 1 -; RV64-UF2-NEXT: [[TMP27:%.*]] = mul i64 -1, [[TMP26]] -; RV64-UF2-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[TMP24]], i64 [[TMP25]] -; RV64-UF2-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, ptr [[TMP28]], i64 [[TMP27]] -; RV64-UF2-NEXT: [[TMP30:%.*]] = mul i64 -1, [[TMP18]] -; RV64-UF2-NEXT: [[TMP31:%.*]] = sub i64 [[TMP18]], 1 -; RV64-UF2-NEXT: [[TMP32:%.*]] = mul i64 -1, [[TMP31]] -; RV64-UF2-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, ptr [[TMP24]], i64 [[TMP30]] -; RV64-UF2-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32, ptr [[TMP33]], i64 [[TMP32]] -; RV64-UF2-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP29]], align 4 -; RV64-UF2-NEXT: [[REVERSE:%.*]] = call @llvm.vector.reverse.nxv4i32( [[WIDE_LOAD]]) -; RV64-UF2-NEXT: [[WIDE_LOAD4:%.*]] = load , ptr [[TMP34]], align 4 -; RV64-UF2-NEXT: [[REVERSE5:%.*]] = call @llvm.vector.reverse.nxv4i32( [[WIDE_LOAD4]]) -; RV64-UF2-NEXT: [[TMP35:%.*]] = add [[REVERSE]], splat (i32 1) -; RV64-UF2-NEXT: [[TMP36:%.*]] = add [[REVERSE5]], splat (i32 1) -; RV64-UF2-NEXT: [[TMP37:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP23]] -; RV64-UF2-NEXT: [[TMP38:%.*]] = mul i64 0, [[TMP18]] -; RV64-UF2-NEXT: [[TMP39:%.*]] = sub i64 [[TMP18]], 1 -; RV64-UF2-NEXT: [[TMP40:%.*]] = mul i64 -1, [[TMP39]] -; RV64-UF2-NEXT: [[TMP41:%.*]] = getelementptr inbounds i32, ptr [[TMP37]], i64 [[TMP38]] -; RV64-UF2-NEXT: [[TMP42:%.*]] = getelementptr inbounds i32, ptr [[TMP41]], i64 [[TMP40]] -; RV64-UF2-NEXT: [[TMP43:%.*]] = mul i64 -1, [[TMP18]] -; RV64-UF2-NEXT: [[TMP44:%.*]] = sub i64 [[TMP18]], 1 -; RV64-UF2-NEXT: [[TMP45:%.*]] = mul i64 -1, [[TMP44]] -; RV64-UF2-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, ptr [[TMP37]], i64 [[TMP43]] -; RV64-UF2-NEXT: [[TMP47:%.*]] = getelementptr inbounds i32, ptr [[TMP46]], i64 [[TMP45]] -; RV64-UF2-NEXT: [[REVERSE6:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP35]]) -; RV64-UF2-NEXT: store [[REVERSE6]], ptr [[TMP42]], align 4 -; RV64-UF2-NEXT: [[REVERSE7:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP36]]) -; RV64-UF2-NEXT: store [[REVERSE7]], ptr [[TMP47]], align 4 -; RV64-UF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP19]] -; RV64-UF2-NEXT: [[TMP48:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; RV64-UF2-NEXT: br i1 [[TMP48]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] -; RV64-UF2: [[MIDDLE_BLOCK]]: -; RV64-UF2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] -; RV64-UF2-NEXT: br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]], label %[[SCALAR_PH]] -; RV64-UF2: [[SCALAR_PH]]: -; RV64-UF2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP20]], %[[MIDDLE_BLOCK]] ], [ [[TMP0]], %[[FOR_BODY_PREHEADER]] ], [ [[TMP0]], %[[VECTOR_SCEVCHECK]] ], [ [[TMP0]], %[[VECTOR_MEMCHECK]] ] -; RV64-UF2-NEXT: [[BC_RESUME_VAL8:%.*]] = phi i32 [ [[TMP21]], %[[MIDDLE_BLOCK]] ], [ [[N]], %[[FOR_BODY_PREHEADER]] ], [ [[N]], %[[VECTOR_SCEVCHECK]] ], [ [[N]], %[[VECTOR_MEMCHECK]] ] -; RV64-UF2-NEXT: br label %[[FOR_BODY:.*]] -; RV64-UF2: [[FOR_COND_CLEANUP_LOOPEXIT]]: -; RV64-UF2-NEXT: br label %[[FOR_COND_CLEANUP]] -; 
RV64-UF2: [[FOR_COND_CLEANUP]]: -; RV64-UF2-NEXT: ret void -; RV64-UF2: [[FOR_BODY]]: ; entry: %cmp7 = icmp sgt i32 %n, 0 @@ -478,259 +225,6 @@ for.body: ; preds = %for.body.preheader, } define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocapture noundef readonly %B, i32 noundef signext %n) { -; RV64-LABEL: define void @vector_reverse_f32( -; RV64-SAME: ptr noundef writeonly captures(none) [[A:%.*]], ptr noundef readonly captures(none) [[B:%.*]], i32 noundef signext [[N:%.*]]) #[[ATTR0]] { -; RV64-NEXT: [[ENTRY:.*:]] -; RV64-NEXT: [[A2:%.*]] = ptrtoint ptr [[A]] to i64 -; RV64-NEXT: [[B1:%.*]] = ptrtoint ptr [[B]] to i64 -; RV64-NEXT: [[CMP7:%.*]] = icmp sgt i32 [[N]], 0 -; RV64-NEXT: br i1 [[CMP7]], label %[[FOR_BODY_PREHEADER:.*]], label %[[FOR_COND_CLEANUP:.*]] -; RV64: [[FOR_BODY_PREHEADER]]: -; RV64-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64 -; RV64-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; RV64-NEXT: [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4 -; RV64-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]] -; RV64-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]] -; RV64: [[VECTOR_SCEVCHECK]]: -; RV64-NEXT: [[TMP3:%.*]] = add nsw i64 [[TMP0]], -1 -; RV64-NEXT: [[TMP4:%.*]] = add i32 [[N]], -1 -; RV64-NEXT: [[TMP5:%.*]] = trunc i64 [[TMP3]] to i32 -; RV64-NEXT: [[MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 1, i32 [[TMP5]]) -; RV64-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL]], 0 -; RV64-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL]], 1 -; RV64-NEXT: [[TMP6:%.*]] = sub i32 [[TMP4]], [[MUL_RESULT]] -; RV64-NEXT: [[TMP7:%.*]] = icmp ugt i32 [[TMP6]], [[TMP4]] -; RV64-NEXT: [[TMP8:%.*]] = or i1 [[TMP7]], [[MUL_OVERFLOW]] -; RV64-NEXT: [[TMP9:%.*]] = icmp ugt i64 [[TMP3]], 4294967295 -; RV64-NEXT: [[TMP10:%.*]] = or i1 [[TMP8]], [[TMP9]] -; RV64-NEXT: br i1 [[TMP10]], label %[[SCALAR_PH]], label %[[VECTOR_MEMCHECK:.*]] -; RV64: [[VECTOR_MEMCHECK]]: -; RV64-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() -; RV64-NEXT: [[TMP12:%.*]] = mul nuw i64 [[TMP11]], 4 -; RV64-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 4 -; RV64-NEXT: [[TMP14:%.*]] = sub i64 [[B1]], [[A2]] -; RV64-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP14]], [[TMP13]] -; RV64-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] -; RV64: [[VECTOR_PH]]: -; RV64-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() -; RV64-NEXT: [[TMP16:%.*]] = mul nuw i64 [[TMP15]], 4 -; RV64-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP16]] -; RV64-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]] -; RV64-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64() -; RV64-NEXT: [[TMP18:%.*]] = mul nuw i64 [[TMP17]], 4 -; RV64-NEXT: [[TMP19:%.*]] = sub i64 [[TMP0]], [[N_VEC]] -; RV64-NEXT: [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32 -; RV64-NEXT: [[TMP20:%.*]] = sub i32 [[N]], [[DOTCAST]] -; RV64-NEXT: br label %[[VECTOR_BODY:.*]] -; RV64: [[VECTOR_BODY]]: -; RV64-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; RV64-NEXT: [[DOTCAST3:%.*]] = trunc i64 [[INDEX]] to i32 -; RV64-NEXT: [[OFFSET_IDX:%.*]] = sub i32 [[N]], [[DOTCAST3]] -; RV64-NEXT: [[TMP21:%.*]] = add nsw i32 [[OFFSET_IDX]], -1 -; RV64-NEXT: [[TMP22:%.*]] = zext i32 [[TMP21]] to i64 -; RV64-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP22]] -; RV64-NEXT: [[TMP24:%.*]] = mul i64 0, [[TMP18]] -; RV64-NEXT: [[TMP25:%.*]] = sub i64 [[TMP18]], 1 -; RV64-NEXT: [[TMP26:%.*]] = mul i64 
-1, [[TMP25]] -; RV64-NEXT: [[TMP27:%.*]] = getelementptr inbounds float, ptr [[TMP23]], i64 [[TMP24]] -; RV64-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, ptr [[TMP27]], i64 [[TMP26]] -; RV64-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP28]], align 4 -; RV64-NEXT: [[REVERSE:%.*]] = call @llvm.vector.reverse.nxv4f32( [[WIDE_LOAD]]) -; RV64-NEXT: [[TMP29:%.*]] = fadd [[REVERSE]], splat (float 1.000000e+00) -; RV64-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP22]] -; RV64-NEXT: [[TMP31:%.*]] = mul i64 0, [[TMP18]] -; RV64-NEXT: [[TMP32:%.*]] = sub i64 [[TMP18]], 1 -; RV64-NEXT: [[TMP33:%.*]] = mul i64 -1, [[TMP32]] -; RV64-NEXT: [[TMP34:%.*]] = getelementptr inbounds float, ptr [[TMP30]], i64 [[TMP31]] -; RV64-NEXT: [[TMP35:%.*]] = getelementptr inbounds float, ptr [[TMP34]], i64 [[TMP33]] -; RV64-NEXT: [[REVERSE4:%.*]] = call @llvm.vector.reverse.nxv4f32( [[TMP29]]) -; RV64-NEXT: store [[REVERSE4]], ptr [[TMP35]], align 4 -; RV64-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP18]] -; RV64-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; RV64-NEXT: br i1 [[TMP36]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] -; RV64: [[MIDDLE_BLOCK]]: -; RV64-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] -; RV64-NEXT: br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]], label %[[SCALAR_PH]] -; RV64: [[SCALAR_PH]]: -; RV64-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP19]], %[[MIDDLE_BLOCK]] ], [ [[TMP0]], %[[FOR_BODY_PREHEADER]] ], [ [[TMP0]], %[[VECTOR_SCEVCHECK]] ], [ [[TMP0]], %[[VECTOR_MEMCHECK]] ] -; RV64-NEXT: [[BC_RESUME_VAL5:%.*]] = phi i32 [ [[TMP20]], %[[MIDDLE_BLOCK]] ], [ [[N]], %[[FOR_BODY_PREHEADER]] ], [ [[N]], %[[VECTOR_SCEVCHECK]] ], [ [[N]], %[[VECTOR_MEMCHECK]] ] -; RV64-NEXT: br label %[[FOR_BODY:.*]] -; RV64: [[FOR_COND_CLEANUP_LOOPEXIT]]: -; RV64-NEXT: br label %[[FOR_COND_CLEANUP]] -; RV64: [[FOR_COND_CLEANUP]]: -; RV64-NEXT: ret void -; RV64: [[FOR_BODY]]: -; -; RV32-LABEL: define void @vector_reverse_f32( -; RV32-SAME: ptr noundef writeonly captures(none) [[A:%.*]], ptr noundef readonly captures(none) [[B:%.*]], i32 noundef signext [[N:%.*]]) #[[ATTR0]] { -; RV32-NEXT: [[ENTRY:.*:]] -; RV32-NEXT: [[A2:%.*]] = ptrtoint ptr [[A]] to i32 -; RV32-NEXT: [[B1:%.*]] = ptrtoint ptr [[B]] to i32 -; RV32-NEXT: [[CMP7:%.*]] = icmp sgt i32 [[N]], 0 -; RV32-NEXT: br i1 [[CMP7]], label %[[FOR_BODY_PREHEADER:.*]], label %[[FOR_COND_CLEANUP:.*]] -; RV32: [[FOR_BODY_PREHEADER]]: -; RV32-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64 -; RV32-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; RV32-NEXT: [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4 -; RV32-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]] -; RV32-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] -; RV32: [[VECTOR_MEMCHECK]]: -; RV32-NEXT: [[TMP3:%.*]] = call i32 @llvm.vscale.i32() -; RV32-NEXT: [[TMP4:%.*]] = mul nuw i32 [[TMP3]], 4 -; RV32-NEXT: [[TMP5:%.*]] = mul i32 [[TMP4]], 4 -; RV32-NEXT: [[TMP6:%.*]] = sub i32 [[B1]], [[A2]] -; RV32-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i32 [[TMP6]], [[TMP5]] -; RV32-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] -; RV32: [[VECTOR_PH]]: -; RV32-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() -; RV32-NEXT: [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 4 -; RV32-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP8]] -; RV32-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]] -; RV32-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() -; RV32-NEXT: 
[[TMP10:%.*]] = mul nuw i64 [[TMP9]], 4 -; RV32-NEXT: [[TMP11:%.*]] = sub i64 [[TMP0]], [[N_VEC]] -; RV32-NEXT: [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32 -; RV32-NEXT: [[TMP12:%.*]] = sub i32 [[N]], [[DOTCAST]] -; RV32-NEXT: br label %[[VECTOR_BODY:.*]] -; RV32: [[VECTOR_BODY]]: -; RV32-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; RV32-NEXT: [[DOTCAST3:%.*]] = trunc i64 [[INDEX]] to i32 -; RV32-NEXT: [[OFFSET_IDX:%.*]] = sub i32 [[N]], [[DOTCAST3]] -; RV32-NEXT: [[TMP13:%.*]] = add nsw i32 [[OFFSET_IDX]], -1 -; RV32-NEXT: [[TMP14:%.*]] = zext i32 [[TMP13]] to i64 -; RV32-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP14]] -; RV32-NEXT: [[TMP16:%.*]] = trunc i64 [[TMP10]] to i32 -; RV32-NEXT: [[TMP17:%.*]] = mul i32 0, [[TMP16]] -; RV32-NEXT: [[TMP18:%.*]] = sub i32 [[TMP16]], 1 -; RV32-NEXT: [[TMP19:%.*]] = mul i32 -1, [[TMP18]] -; RV32-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, ptr [[TMP15]], i32 [[TMP17]] -; RV32-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i32 [[TMP19]] -; RV32-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP21]], align 4 -; RV32-NEXT: [[REVERSE:%.*]] = call @llvm.vector.reverse.nxv4f32( [[WIDE_LOAD]]) -; RV32-NEXT: [[TMP22:%.*]] = fadd [[REVERSE]], splat (float 1.000000e+00) -; RV32-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP14]] -; RV32-NEXT: [[TMP24:%.*]] = trunc i64 [[TMP10]] to i32 -; RV32-NEXT: [[TMP25:%.*]] = mul i32 0, [[TMP24]] -; RV32-NEXT: [[TMP26:%.*]] = sub i32 [[TMP24]], 1 -; RV32-NEXT: [[TMP27:%.*]] = mul i32 -1, [[TMP26]] -; RV32-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, ptr [[TMP23]], i32 [[TMP25]] -; RV32-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, ptr [[TMP28]], i32 [[TMP27]] -; RV32-NEXT: [[REVERSE4:%.*]] = call @llvm.vector.reverse.nxv4f32( [[TMP22]]) -; RV32-NEXT: store [[REVERSE4]], ptr [[TMP29]], align 4 -; RV32-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]] -; RV32-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; RV32-NEXT: br i1 [[TMP30]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] -; RV32: [[MIDDLE_BLOCK]]: -; RV32-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] -; RV32-NEXT: br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]], label %[[SCALAR_PH]] -; RV32: [[SCALAR_PH]]: -; RV32-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP11]], %[[MIDDLE_BLOCK]] ], [ [[TMP0]], %[[FOR_BODY_PREHEADER]] ], [ [[TMP0]], %[[VECTOR_MEMCHECK]] ] -; RV32-NEXT: [[BC_RESUME_VAL5:%.*]] = phi i32 [ [[TMP12]], %[[MIDDLE_BLOCK]] ], [ [[N]], %[[FOR_BODY_PREHEADER]] ], [ [[N]], %[[VECTOR_MEMCHECK]] ] -; RV32-NEXT: br label %[[FOR_BODY:.*]] -; RV32: [[FOR_COND_CLEANUP_LOOPEXIT]]: -; RV32-NEXT: br label %[[FOR_COND_CLEANUP]] -; RV32: [[FOR_COND_CLEANUP]]: -; RV32-NEXT: ret void -; RV32: [[FOR_BODY]]: -; -; RV64-UF2-LABEL: define void @vector_reverse_f32( -; RV64-UF2-SAME: ptr noundef writeonly captures(none) [[A:%.*]], ptr noundef readonly captures(none) [[B:%.*]], i32 noundef signext [[N:%.*]]) #[[ATTR0]] { -; RV64-UF2-NEXT: [[ENTRY:.*:]] -; RV64-UF2-NEXT: [[A2:%.*]] = ptrtoint ptr [[A]] to i64 -; RV64-UF2-NEXT: [[B1:%.*]] = ptrtoint ptr [[B]] to i64 -; RV64-UF2-NEXT: [[CMP7:%.*]] = icmp sgt i32 [[N]], 0 -; RV64-UF2-NEXT: br i1 [[CMP7]], label %[[FOR_BODY_PREHEADER:.*]], label %[[FOR_COND_CLEANUP:.*]] -; RV64-UF2: [[FOR_BODY_PREHEADER]]: -; RV64-UF2-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64 -; RV64-UF2-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; 
RV64-UF2-NEXT: [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 8 -; RV64-UF2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]] -; RV64-UF2-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]] -; RV64-UF2: [[VECTOR_SCEVCHECK]]: -; RV64-UF2-NEXT: [[TMP3:%.*]] = add nsw i64 [[TMP0]], -1 -; RV64-UF2-NEXT: [[TMP4:%.*]] = add i32 [[N]], -1 -; RV64-UF2-NEXT: [[TMP5:%.*]] = trunc i64 [[TMP3]] to i32 -; RV64-UF2-NEXT: [[MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 1, i32 [[TMP5]]) -; RV64-UF2-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL]], 0 -; RV64-UF2-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL]], 1 -; RV64-UF2-NEXT: [[TMP6:%.*]] = sub i32 [[TMP4]], [[MUL_RESULT]] -; RV64-UF2-NEXT: [[TMP7:%.*]] = icmp ugt i32 [[TMP6]], [[TMP4]] -; RV64-UF2-NEXT: [[TMP8:%.*]] = or i1 [[TMP7]], [[MUL_OVERFLOW]] -; RV64-UF2-NEXT: [[TMP9:%.*]] = icmp ugt i64 [[TMP3]], 4294967295 -; RV64-UF2-NEXT: [[TMP10:%.*]] = or i1 [[TMP8]], [[TMP9]] -; RV64-UF2-NEXT: br i1 [[TMP10]], label %[[SCALAR_PH]], label %[[VECTOR_MEMCHECK:.*]] -; RV64-UF2: [[VECTOR_MEMCHECK]]: -; RV64-UF2-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() -; RV64-UF2-NEXT: [[TMP12:%.*]] = mul nuw i64 [[TMP11]], 4 -; RV64-UF2-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 8 -; RV64-UF2-NEXT: [[TMP14:%.*]] = sub i64 [[B1]], [[A2]] -; RV64-UF2-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP14]], [[TMP13]] -; RV64-UF2-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] -; RV64-UF2: [[VECTOR_PH]]: -; RV64-UF2-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() -; RV64-UF2-NEXT: [[TMP16:%.*]] = mul nuw i64 [[TMP15]], 8 -; RV64-UF2-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP16]] -; RV64-UF2-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]] -; RV64-UF2-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64() -; RV64-UF2-NEXT: [[TMP18:%.*]] = mul nuw i64 [[TMP17]], 4 -; RV64-UF2-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 2 -; RV64-UF2-NEXT: [[TMP20:%.*]] = sub i64 [[TMP0]], [[N_VEC]] -; RV64-UF2-NEXT: [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32 -; RV64-UF2-NEXT: [[TMP21:%.*]] = sub i32 [[N]], [[DOTCAST]] -; RV64-UF2-NEXT: br label %[[VECTOR_BODY:.*]] -; RV64-UF2: [[VECTOR_BODY]]: -; RV64-UF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; RV64-UF2-NEXT: [[DOTCAST3:%.*]] = trunc i64 [[INDEX]] to i32 -; RV64-UF2-NEXT: [[OFFSET_IDX:%.*]] = sub i32 [[N]], [[DOTCAST3]] -; RV64-UF2-NEXT: [[TMP22:%.*]] = add nsw i32 [[OFFSET_IDX]], -1 -; RV64-UF2-NEXT: [[TMP23:%.*]] = zext i32 [[TMP22]] to i64 -; RV64-UF2-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP23]] -; RV64-UF2-NEXT: [[TMP25:%.*]] = mul i64 0, [[TMP18]] -; RV64-UF2-NEXT: [[TMP26:%.*]] = sub i64 [[TMP18]], 1 -; RV64-UF2-NEXT: [[TMP27:%.*]] = mul i64 -1, [[TMP26]] -; RV64-UF2-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, ptr [[TMP24]], i64 [[TMP25]] -; RV64-UF2-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, ptr [[TMP28]], i64 [[TMP27]] -; RV64-UF2-NEXT: [[TMP30:%.*]] = mul i64 -1, [[TMP18]] -; RV64-UF2-NEXT: [[TMP31:%.*]] = sub i64 [[TMP18]], 1 -; RV64-UF2-NEXT: [[TMP32:%.*]] = mul i64 -1, [[TMP31]] -; RV64-UF2-NEXT: [[TMP33:%.*]] = getelementptr inbounds float, ptr [[TMP24]], i64 [[TMP30]] -; RV64-UF2-NEXT: [[TMP34:%.*]] = getelementptr inbounds float, ptr [[TMP33]], i64 [[TMP32]] -; RV64-UF2-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP29]], align 4 -; RV64-UF2-NEXT: [[REVERSE:%.*]] = call @llvm.vector.reverse.nxv4f32( [[WIDE_LOAD]]) -; 
RV64-UF2-NEXT: [[WIDE_LOAD4:%.*]] = load , ptr [[TMP34]], align 4 -; RV64-UF2-NEXT: [[REVERSE5:%.*]] = call @llvm.vector.reverse.nxv4f32( [[WIDE_LOAD4]]) -; RV64-UF2-NEXT: [[TMP35:%.*]] = fadd [[REVERSE]], splat (float 1.000000e+00) -; RV64-UF2-NEXT: [[TMP36:%.*]] = fadd [[REVERSE5]], splat (float 1.000000e+00) -; RV64-UF2-NEXT: [[TMP37:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP23]] -; RV64-UF2-NEXT: [[TMP38:%.*]] = mul i64 0, [[TMP18]] -; RV64-UF2-NEXT: [[TMP39:%.*]] = sub i64 [[TMP18]], 1 -; RV64-UF2-NEXT: [[TMP40:%.*]] = mul i64 -1, [[TMP39]] -; RV64-UF2-NEXT: [[TMP41:%.*]] = getelementptr inbounds float, ptr [[TMP37]], i64 [[TMP38]] -; RV64-UF2-NEXT: [[TMP42:%.*]] = getelementptr inbounds float, ptr [[TMP41]], i64 [[TMP40]] -; RV64-UF2-NEXT: [[TMP43:%.*]] = mul i64 -1, [[TMP18]] -; RV64-UF2-NEXT: [[TMP44:%.*]] = sub i64 [[TMP18]], 1 -; RV64-UF2-NEXT: [[TMP45:%.*]] = mul i64 -1, [[TMP44]] -; RV64-UF2-NEXT: [[TMP46:%.*]] = getelementptr inbounds float, ptr [[TMP37]], i64 [[TMP43]] -; RV64-UF2-NEXT: [[TMP47:%.*]] = getelementptr inbounds float, ptr [[TMP46]], i64 [[TMP45]] -; RV64-UF2-NEXT: [[REVERSE6:%.*]] = call @llvm.vector.reverse.nxv4f32( [[TMP35]]) -; RV64-UF2-NEXT: store [[REVERSE6]], ptr [[TMP42]], align 4 -; RV64-UF2-NEXT: [[REVERSE7:%.*]] = call @llvm.vector.reverse.nxv4f32( [[TMP36]]) -; RV64-UF2-NEXT: store [[REVERSE7]], ptr [[TMP47]], align 4 -; RV64-UF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP19]] -; RV64-UF2-NEXT: [[TMP48:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; RV64-UF2-NEXT: br i1 [[TMP48]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] -; RV64-UF2: [[MIDDLE_BLOCK]]: -; RV64-UF2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] -; RV64-UF2-NEXT: br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]], label %[[SCALAR_PH]] -; RV64-UF2: [[SCALAR_PH]]: -; RV64-UF2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP20]], %[[MIDDLE_BLOCK]] ], [ [[TMP0]], %[[FOR_BODY_PREHEADER]] ], [ [[TMP0]], %[[VECTOR_SCEVCHECK]] ], [ [[TMP0]], %[[VECTOR_MEMCHECK]] ] -; RV64-UF2-NEXT: [[BC_RESUME_VAL8:%.*]] = phi i32 [ [[TMP21]], %[[MIDDLE_BLOCK]] ], [ [[N]], %[[FOR_BODY_PREHEADER]] ], [ [[N]], %[[VECTOR_SCEVCHECK]] ], [ [[N]], %[[VECTOR_MEMCHECK]] ] -; RV64-UF2-NEXT: br label %[[FOR_BODY:.*]] -; RV64-UF2: [[FOR_COND_CLEANUP_LOOPEXIT]]: -; RV64-UF2-NEXT: br label %[[FOR_COND_CLEANUP]] -; RV64-UF2: [[FOR_COND_CLEANUP]]: -; RV64-UF2-NEXT: ret void -; RV64-UF2: [[FOR_BODY]]: ; entry: %cmp7 = icmp sgt i32 %n, 0 diff --git a/llvm/test/Transforms/LoopVectorize/SystemZ/load-store-scalarization-cost.ll b/llvm/test/Transforms/LoopVectorize/SystemZ/load-store-scalarization-cost.ll index deb81099ac676..e24c5a1a2bf2d 100644 --- a/llvm/test/Transforms/LoopVectorize/SystemZ/load-store-scalarization-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/SystemZ/load-store-scalarization-cost.ll @@ -27,7 +27,6 @@ for.end: ; CHECK: LV: Scalarizing: %tmp1 = load i32, ptr %tmp0, align 4 ; CHECK: LV: Scalarizing: store i32 %tmp2, ptr %tmp0, align 4 -; CHECK: LV: Found an estimated cost of 4 for VF 4 For instruction: %tmp1 = load i32, ptr %tmp0, align 4 -; CHECK: LV: Found an estimated cost of 4 for VF 4 For instruction: store i32 %tmp2, ptr %tmp0, align 4 +; CHECK: Cost of 4 for VF 4: REPLICATE ir<%tmp1> = load ir<%tmp0> +; CHECK: Cost of 4 for VF 4: REPLICATE store ir<%tmp2>, ir<%tmp0> } -
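For readers unfamiliar with the VPlan-side queries this patch switches to, the sketch below condenses the traversal pattern the moved LoopVectorizationPlanner::selectInterleaveCount now relies on: reductions and memory operations are discovered by walking the plan's vector loop region instead of asking LoopVectorizationLegality. The helper names (planHasReductions, countMemOps) and the include list are illustrative assumptions, not part of the patch; the recipe classes and accessors are the ones the diff itself uses.

```c++
// Illustrative sketch only, not part of the patch. Helper names and the
// include set are assumed; the recipe APIs mirror the diff above.
#include "VPlan.h"        // in-tree header next to LoopVectorize.cpp
#include "VPlanCFG.h"     // vp_depth_first_deep
#include "VPlanUtils.h"   // VPBlockUtils
#include "llvm/IR/Instructions.h"
#include <utility>

using namespace llvm;

// Reductions are detected from the header phi recipes of the vector loop
// region rather than from Legal->getReductionVars().
static bool planHasReductions(VPlan &Plan) {
  return any_of(Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis(),
                IsaPred<VPReductionPHIRecipe>);
}

// Loads and stores are counted from the plan's recipes rather than from
// Legal->getNumLoads()/getNumStores(); the counts feed the "saturate the
// load/store ports" clamp on the interleave count.
static std::pair<unsigned, unsigned> countMemOps(VPlan &Plan) {
  unsigned NumLoads = 0, NumStores = 0;
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
           vp_depth_first_deep(Plan.getVectorLoopRegion()->getEntry()))) {
    for (VPRecipeBase &R : *VPBB) {
      if (isa<VPWidenLoadRecipe>(&R)) {
        NumLoads++;
      } else if (isa<VPWidenStoreRecipe>(&R)) {
        NumStores++;
      } else if (auto *InterleaveR = dyn_cast<VPInterleaveRecipe>(&R)) {
        // An interleave group is either a store group (has store operands)
        // or a load group (defines the loaded values).
        if (unsigned StoreOps = InterleaveR->getNumStoreOperands())
          NumStores += StoreOps;
        else
          NumLoads += InterleaveR->getNumDefinedValues();
      } else if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
        // Scalarized (replicated) memory accesses still count once each.
        NumLoads += isa<LoadInst>(RepR->getUnderlyingInstr());
        NumStores += isa<StoreInst>(RepR->getUnderlyingInstr());
      }
    }
  }
  return {NumLoads, NumStores};
}
```

With inputs gathered this way, the heuristic's arithmetic is unchanged (for example, StoresIC is still IC divided by the store count, clamped to at least 1); what changes is that the inputs now come purely from the VPlan being costed.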