llvm · SergeyShch01 · Jul 8, 2025 · Jul 9, 2025 · Jul 9, 2025 · Jul 9, 2025
diff --git a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -200,11 +200,16 @@ static cl::opt<bool> DropScaledForVScale(
     "lsr-drop-scaled-reg-for-vscale", cl::Hidden, cl::init(true),
     cl::desc("Avoid using scaled registers with vscale-relative addressing"));
 
+static cl::opt<unsigned> MinExpensiveFromulaeNum(
+    "lsr-min-expensive-formula-num", cl::Hidden, cl::init(16),
+    cl::desc(
+        "Minimum number of use formulae to consider their study expensive"));
+
 #ifndef NDEBUG
 // Stress test IV chain generation.
-static cl::opt<bool> StressIVChain(
-  "stress-ivchain", cl::Hidden, cl::init(false),
-  cl::desc("Stress test LSR IV chains"));
+static cl::opt<bool> StressIVChain("stress-ivchain", cl::Hidden,
+                                   cl::init(false),
+                                   cl::desc("Stress test LSR IV chains"));
 #else
 static bool StressIVChain = false;
 #endif
@@ -2251,6 +2256,7 @@ class LSRInstance {
   void NarrowSearchSpaceByDeletingCostlyFormulas();
   void NarrowSearchSpaceByPickingWinnerRegs();
   void NarrowSearchSpaceUsingHeuristics();
+  bool SortLSRUses();
 
   void SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
                     Cost &SolutionCost,
@@ -5395,6 +5401,58 @@ void LSRInstance::NarrowSearchSpaceUsingHeuristics() {
     NarrowSearchSpaceByPickingWinnerRegs();
 }
 
+/// Sort LSRUses to address side effects of compile time optimization done in
+/// SolveRecurse which filters out formulae not including required registers.
+/// Such optimization makes the found best solution sensitive to the order
+/// of LSRUses processing, hence it's important to ensure that that order
+/// isn't random to avoid fluctuations and sub-optimal results.
+///
+/// Also check that all LSRUses have formulae as otherwise the situation is
+/// unsolvable.
+bool LSRInstance::SortLSRUses() {
+  SmallVector<LSRUse *, 16> NewOrder;
+  for (LSRUse &LU : Uses) {
+    if (LU.Formulae.empty()) {
+      return false;
+    }
+    NewOrder.push_back(&LU);
+  }
+
+  stable_sort(NewOrder, [](const LSRUse *L, const LSRUse *R) {
+    auto CalcKey = [](const LSRUse *LU) {
+      return std::tuple(LU->Regs.size(), LU->Formulae.size(), LU->Kind,
+                        LU->MinOffset.getKnownMinValue(),
+                        LU->MaxOffset.getKnownMinValue(),
+                        LU->AllFixupsOutsideLoop, LU->RigidFormula);
+    };
+
+    bool LExpensive = L->Formulae.size() >= MinExpensiveFromulaeNum;
+    bool RExpensive = R->Formulae.size() >= MinExpensiveFromulaeNum;
+
+    // If there are too many forlmulae then LSRUses w/ less formulae
+    // go first to save compilation time
+    if (LExpensive != RExpensive) {
+      return RExpensive;
+    }
+    if (LExpensive) {
+      return L->Formulae.size() < R->Formulae.size();
+    }
+
+    // LSRUses w/ many registers and formulae go first to avoid too big
+    // reduction of considered solutions count
+    return CalcKey(L) > CalcKey(R);
+  });
+
+  SmallVector<LSRUse, 4> NewUses;
+  for (LSRUse *LU : NewOrder)
+    NewUses.push_back(std::move(*LU));
+  Uses = std::move(NewUses);
+
+  LLVM_DEBUG(dbgs() << "\nAfter sorting:\n"; print_uses(dbgs()));
+
+  return true;
+}
+
 /// This is the recursive solver.
 void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
                                Cost &SolutionCost,
@@ -5414,6 +5472,10 @@ void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
 
   const LSRUse &LU = Uses[Workspace.size()];
 
+  assert(!LU.Formulae.empty() &&
+         "LSRUse w/o formulae leads to unsolvable situation so it"
+         "shouldn't be here");
+
   // If this use references any register that's already a part of the
   // in-progress solution, consider it a requirement that a formula must
   // reference that register in order to be considered. This prunes out
@@ -5425,54 +5487,73 @@ void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
 
   SmallPtrSet<const SCEV *, 16> NewRegs;
   Cost NewCost(L, SE, TTI, AMK);
-  for (const Formula &F : LU.Formulae) {
-    // Ignore formulae which may not be ideal in terms of register reuse of
-    // ReqRegs.  The formula should use all required registers before
-    // introducing new ones.
-    // This can sometimes (notably when trying to favour postinc) lead to
-    // sub-optimial decisions. There it is best left to the cost modelling to
-    // get correct.
-    if (AMK != TTI::AMK_PostIndexed || LU.Kind != LSRUse::Address) {
-      int NumReqRegsToFind = std::min(F.getNumRegs(), ReqRegs.size());
-      for (const SCEV *Reg : ReqRegs) {
-        if ((F.ScaledReg && F.ScaledReg == Reg) ||
-            is_contained(F.BaseRegs, Reg)) {
-          --NumReqRegsToFind;
-          if (NumReqRegsToFind == 0)
-            break;
+  bool FormulaeTested = false;
+  unsigned NumReqRegsToIgnore = 0;
+
+  while (!FormulaeTested) {
+    assert(
+        !NumReqRegsToIgnore ||
+        NumReqRegsToIgnore < ReqRegs.size() &&
+            "at least one formulae should have at least one required register");
+
+    for (const Formula &F : LU.Formulae) {
+      // ReqRegs. The formula should use required registers before
+      // introducing new ones. Firstly try the most aggressive option
+      // (when maximum of required registers are used) and then gradually make
+      // it weaker if all formulae don't satisfy this requirement.
+      //
+      // This can sometimes (notably when trying to favour postinc) lead to
+      // sub-optimal decisions. There it is best left to the cost modeling to
+      // get correct.
+      if (!ReqRegs.empty() &&
+          (AMK != TTI::AMK_PostIndexed || LU.Kind != LSRUse::Address)) {
+        unsigned NumReqRegsToFind = std::min(F.getNumRegs(), ReqRegs.size());
+        bool ReqRegsFound = false;
+        for (const SCEV *Reg : ReqRegs) {
+          if ((F.ScaledReg && F.ScaledReg == Reg) ||
+              is_contained(F.BaseRegs, Reg)) {
+            ReqRegsFound = true;
+            if (--NumReqRegsToFind == NumReqRegsToIgnore)
+              break;
+          }
+        }
+        if (!ReqRegsFound || NumReqRegsToFind != NumReqRegsToIgnore) {
+          continue;
         }
       }
-      if (NumReqRegsToFind != 0) {
-        // If none of the formulae satisfied the required registers, then we could
-        // clear ReqRegs and try again. Currently, we simply give up in this case.
-        continue;
-      }
-    }
 
-    // Evaluate the cost of the current formula. If it's already worse than
-    // the current best, prune the search at that point.
-    NewCost = CurCost;
-    NewRegs = CurRegs;
-    NewCost.RateFormula(F, NewRegs, VisitedRegs, LU, HardwareLoopProfitable);
-    if (NewCost.isLess(SolutionCost)) {
-      Workspace.push_back(&F);
-      if (Workspace.size() != Uses.size()) {
-        SolveRecurse(Solution, SolutionCost, Workspace, NewCost,
-                     NewRegs, VisitedRegs);
-        if (F.getNumRegs() == 1 && Workspace.size() == 1)
-          VisitedRegs.insert(F.ScaledReg ? F.ScaledReg : F.BaseRegs[0]);
-      } else {
-        LLVM_DEBUG(dbgs() << "New best at "; NewCost.print(dbgs());
-                   dbgs() << ".\nRegs:\n";
-                   for (const SCEV *S : NewRegs) dbgs()
-                      << "- " << *S << "\n";
-                   dbgs() << '\n');
-
-        SolutionCost = NewCost;
-        Solution = Workspace;
+      // Evaluate the cost of the current formula. If it's already worse than
+      // the current best, prune the search at that point.
+      FormulaeTested = true;
+      NewCost = CurCost;
+      NewRegs = CurRegs;
+      NewCost.RateFormula(F, NewRegs, VisitedRegs, LU, HardwareLoopProfitable);
+      if (NewCost.isLess(SolutionCost)) {
+        Workspace.push_back(&F);
+        if (Workspace.size() != Uses.size()) {
+          SolveRecurse(Solution, SolutionCost, Workspace, NewCost, NewRegs,
+                       VisitedRegs);
+          if (F.getNumRegs() == 1 && Workspace.size() == 1)
+            VisitedRegs.insert(F.ScaledReg ? F.ScaledReg : F.BaseRegs[0]);
+        } else {
+          LLVM_DEBUG({
+            dbgs() << "New best at ";
+            NewCost.print(dbgs());
+            dbgs() << ".\nRegs:\n";
+            for (const SCEV *S : NewRegs)
+              dbgs() << "- " << *S << "\n";
+            dbgs() << '\n';
+          });
+          SolutionCost = NewCost;
+          Solution = Workspace;
+        }
+        Workspace.pop_back();
       }
-      Workspace.pop_back();
     }
+
+    // none of formulae has necessary number of required registers - then
+    // make the requirement weaker
+    NumReqRegsToIgnore++;
   }
 }
 
@@ -6213,7 +6294,11 @@ LSRInstance::LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE,
   NarrowSearchSpaceUsingHeuristics();
 
   SmallVector<const Formula *, 8> Solution;
-  Solve(Solution);
+  bool LSRUsesConsistent = SortLSRUses();
+
+  if (LSRUsesConsistent) {
+    Solve(Solution);
+  }
 
   // Release memory that is no longer needed.
   Factors.clear();

diff --git a/llvm/test/CodeGen/AArch64/aarch64-p2align-max-bytes.ll b/llvm/test/CodeGen/AArch64/aarch64-p2align-max-bytes.ll
@@ -16,14 +16,14 @@ define i32 @a(i32 %x, ptr nocapture readonly %y, ptr nocapture readonly %z) {
 ; CHECK-IMPLICIT:    .p2align 5
 ; CHECK-NEXT:  .LBB0_8: // %for.body
 ; CHECK-OBJ;Disassembly of section .text:
-; CHECK-OBJ:               88: 8b0a002a      add
+; CHECK-OBJ:               88: 8b0b002b      add
 ; CHECK-OBJ-IMPLICIT-NEXT: 8c: d503201f      nop
 ; CHECK-OBJ-IMPLICIT-NEXT: 90: d503201f      nop
 ; CHECK-OBJ-IMPLICIT-NEXT: 94: d503201f      nop
 ; CHECK-OBJ-IMPLICIT-NEXT: 98: d503201f      nop
 ; CHECK-OBJ-IMPLICIT-NEXT: 9c: d503201f      nop
-; CHECK-OBJ-IMPLICIT-NEXT: a0: b840454b      ldr
-; CHECK-OBJ-EXPLICIT-NEXT: 8c: b840454b      ldr
+; CHECK-OBJ-IMPLICIT-NEXT: a0: b8404569      ldr
+; CHECK-OBJ-EXPLICIT-NEXT: 8c: b8404569      ldr
 entry:
   %cmp10 = icmp sgt i32 %x, 0
   br i1 %cmp10, label %for.body.preheader, label %for.cond.cleanup

diff --git a/llvm/test/CodeGen/AArch64/machine-combiner-copy.ll b/llvm/test/CodeGen/AArch64/machine-combiner-copy.ll
@@ -33,17 +33,17 @@ define void @fma_dup_f16(ptr noalias nocapture noundef readonly %A, half noundef
 ; CHECK-NEXT:    cmp x9, x8
 ; CHECK-NEXT:    b.eq .LBB0_8
 ; CHECK-NEXT:  .LBB0_6: // %for.body.preheader1
-; CHECK-NEXT:    lsl x10, x9, #1
+; CHECK-NEXT:    lsl x11, x9, #1
 ; CHECK-NEXT:    sub x8, x8, x9
-; CHECK-NEXT:    add x9, x1, x10
-; CHECK-NEXT:    add x10, x0, x10
+; CHECK-NEXT:    add x10, x1, x11
+; CHECK-NEXT:    add x11, x0, x11
 ; CHECK-NEXT:  .LBB0_7: // %for.body
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ldr h1, [x10], #2
-; CHECK-NEXT:    ldr h2, [x9]
+; CHECK-NEXT:    ldr h1, [x11], #2
+; CHECK-NEXT:    ldr h2, [x10]
 ; CHECK-NEXT:    subs x8, x8, #1
 ; CHECK-NEXT:    fmadd h1, h1, h0, h2
-; CHECK-NEXT:    str h1, [x9], #2
+; CHECK-NEXT:    str h1, [x10], #2
 ; CHECK-NEXT:    b.ne .LBB0_7
 ; CHECK-NEXT:  .LBB0_8: // %for.cond.cleanup
 ; CHECK-NEXT:    ret

diff --git a/llvm/test/CodeGen/AArch64/machine-licm-sub-loop.ll b/llvm/test/CodeGen/AArch64/machine-licm-sub-loop.ll
@@ -36,23 +36,23 @@ define void @foo(i32 noundef %limit, ptr %out, ptr %y) {
 ; CHECK-NEXT:  .LBB0_5: // %vector.ph
 ; CHECK-NEXT:    // in Loop: Header=BB0_3 Depth=1
 ; CHECK-NEXT:    dup v0.8h, w15
-; CHECK-NEXT:    mov x16, x14
-; CHECK-NEXT:    mov x17, x13
-; CHECK-NEXT:    mov x18, x12
+; CHECK-NEXT:    mov x16, x12
+; CHECK-NEXT:    mov x17, x14
+; CHECK-NEXT:    mov x18, x13
 ; CHECK-NEXT:  .LBB0_6: // %vector.body
 ; CHECK-NEXT:    // Parent Loop BB0_3 Depth=1
 ; CHECK-NEXT:    // => This Inner Loop Header: Depth=2
-; CHECK-NEXT:    ldp q1, q4, [x16, #-16]
-; CHECK-NEXT:    subs x18, x18, #16
-; CHECK-NEXT:    ldp q3, q2, [x17, #-32]
-; CHECK-NEXT:    add x16, x16, #32
-; CHECK-NEXT:    ldp q6, q5, [x17]
+; CHECK-NEXT:    ldp q1, q4, [x17, #-16]
+; CHECK-NEXT:    subs x16, x16, #16
+; CHECK-NEXT:    ldp q3, q2, [x18, #-32]
+; CHECK-NEXT:    add x17, x17, #32
+; CHECK-NEXT:    ldp q6, q5, [x18]
 ; CHECK-NEXT:    smlal2 v2.4s, v0.8h, v1.8h
 ; CHECK-NEXT:    smlal v3.4s, v0.4h, v1.4h
 ; CHECK-NEXT:    smlal2 v5.4s, v0.8h, v4.8h
 ; CHECK-NEXT:    smlal v6.4s, v0.4h, v4.4h
-; CHECK-NEXT:    stp q3, q2, [x17, #-32]
-; CHECK-NEXT:    stp q6, q5, [x17], #64
+; CHECK-NEXT:    stp q3, q2, [x18, #-32]
+; CHECK-NEXT:    stp q6, q5, [x18], #64
 ; CHECK-NEXT:    b.ne .LBB0_6
 ; CHECK-NEXT:  // %bb.7: // %middle.block
 ; CHECK-NEXT:    // in Loop: Header=BB0_3 Depth=1