Skip to content

Commit 48834cf

Browse files
committed
c/partition_balancer: track detailed information about failed reallocations
When partition replicas area moved in the cluster from whichever reason Redpanda should expose all information required to identify issues experienced during that operation. Previously partition balancer planner included only the basic information about the failures in the status. That was the list of `ntps` that replicas failed to be reallocated and the total number of reallocations failures. In order to obtain more detailed information looking into the partition balancer log was required. Moreover the reallocations that were not even attempted during preparing an allocation plan were not reported at all. This lead to confusion as f.e. partitions with only one replica on the node that is offline that can not be moved was not reported ans the failed one. In order to address this problem we introduced a new `allocation_failure_details` map in the partition balancer planner result. The map provides a detailed information about the failed reallocation. Additionally reallocation failure is reported when partition is marked as `immutable` in the process of planning balancer action. This makes it easy for the operator to identify any partition balancer issues without the need to look for the specific log entries. The change is fully backward compatible as all the changes are incremental i.e. when no detailed information about failures is available Redpanda will fallback to the old basic `ntp` list. Signed-off-by: Michał Maślanka <[email protected]>
1 parent 936098d commit 48834cf

12 files changed

+324
-45
lines changed

src/v/cluster/partition_balancer_backend.cc

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -414,8 +414,8 @@ ss::future<> partition_balancer_backend::do_tick() {
414414

415415
_cur_term->last_tick_time = clock_t::now();
416416
_cur_term->last_violations = std::move(plan_data.violations);
417-
_cur_term->last_tick_decommission_realloc_failures = std::move(
418-
plan_data.decommission_realloc_failures);
417+
_cur_term->last_tick_reallocation_failures = std::move(
418+
plan_data.reallocation_failures);
419419
if (
420420
_state.topics().has_updates_in_progress()
421421
|| plan_data.status == planner_status::actions_planned) {
@@ -571,8 +571,7 @@ partition_balancer_overview_reply partition_balancer_backend::overview() const {
571571

572572
ret.status = _cur_term->last_status;
573573
ret.violations = _cur_term->last_violations;
574-
ret.decommission_realloc_failures
575-
= _cur_term->last_tick_decommission_realloc_failures;
574+
ret.set_reallocation_failures(_cur_term->last_tick_reallocation_failures);
576575
ret.partitions_pending_force_recovery_count
577576
= _state.topics().partitions_to_force_recover().size();
578577
if (ret.partitions_pending_force_recovery_count > 0) {

src/v/cluster/partition_balancer_backend.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -139,8 +139,8 @@ class partition_balancer_backend {
139139
= partition_balancer_status::starting;
140140
size_t last_tick_in_progress_updates = 0;
141141

142-
absl::flat_hash_map<model::node_id, absl::btree_set<model::ntp>>
143-
last_tick_decommission_realloc_failures;
142+
chunked_hash_map<model::ntp, reallocation_failure_details>
143+
last_tick_reallocation_failures;
144144

145145
bool _ondemand_rebalance_requested = false;
146146
bool _force_health_report_refresh = false;

src/v/cluster/partition_balancer_planner.cc

Lines changed: 128 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,33 @@ distinct_from(const absl::flat_hash_set<model::node_id>& nodes) {
6666
return hard_constraint(std::make_unique<impl>(nodes));
6767
}
6868

69+
reallocation_failure_details map_result_to_failure_details(
70+
change_reason reason, std::error_code ec, model::node_id replica_to_move) {
71+
reallocation_failure_details details{
72+
.replica_to_move = replica_to_move,
73+
.reason = reason,
74+
.error = details.error = reallocation_error::unknown_error};
75+
if (ec.category() == cluster::error_category()) {
76+
switch (static_cast<cluster::errc>(ec.value())) {
77+
case cluster::errc::no_eligible_allocation_nodes:
78+
details.error = reallocation_error::no_eligible_node_found;
79+
break;
80+
case cluster::errc::topic_invalid_partitions_fd_limit:
81+
details.error = reallocation_error::over_partition_fd_limit;
82+
break;
83+
case cluster::errc::topic_invalid_partitions_core_limit:
84+
details.error = reallocation_error::over_partition_core_limit;
85+
break;
86+
case cluster::errc::topic_invalid_partitions_memory_limit:
87+
details.error = reallocation_error::over_partition_memory_limit;
88+
break;
89+
default:
90+
break;
91+
}
92+
}
93+
94+
return details;
95+
}
6996
} // namespace
7097

7198
partition_balancer_planner::partition_balancer_planner(
@@ -187,8 +214,8 @@ class partition_balancer_planner::request_context {
187214

188215
void increment_missing_size_count() { _partitions_with_missing_size++; }
189216

190-
void
191-
report_decommission_reallocation_failure(model::node_id, const model::ntp&);
217+
void report_reallocation_failure(
218+
const model::ntp&, reallocation_failure_details);
192219

193220
node2count_t& get_topic_to_node_count(model::topic_namespace_view tp_ns) {
194221
auto it = _topic2node_counts.find(tp_ns);
@@ -240,10 +267,9 @@ class partition_balancer_planner::request_context {
240267
// we track missing partition size info separately as it requires force
241268
// refresh of health report
242269
size_t _partitions_with_missing_size = 0;
243-
// Tracks ntps with allocation failures grouped by decommissioning node.
244-
static constexpr size_t max_ntps_with_reallocation_falures = 25;
245-
absl::flat_hash_map<model::node_id, absl::btree_set<model::ntp>>
246-
_decommission_realloc_failures;
270+
static constexpr size_t max_reallocation_failures_reported = 25;
271+
chunked_hash_map<model::ntp, reallocation_failure_details>
272+
_reallocation_failures;
247273
absl::node_hash_set<model::ntp> _cancellations;
248274
bool _counts_rebalancing_finished = false;
249275
ss::abort_source& _as;
@@ -544,13 +570,13 @@ bool partition_balancer_planner::request_context::increment_failure_count() {
544570
}
545571
}
546572

547-
void partition_balancer_planner::request_context::
548-
report_decommission_reallocation_failure(
549-
model::node_id node, const model::ntp& ntp) {
550-
auto& ntps = _decommission_realloc_failures.try_emplace(node).first->second;
551-
if (ntps.size() < max_ntps_with_reallocation_falures) {
552-
ntps.emplace(ntp);
573+
void partition_balancer_planner::request_context::report_reallocation_failure(
574+
const model::ntp& ntp, reallocation_failure_details details) {
575+
if (_reallocation_failures.size() >= max_reallocation_failures_reported) {
576+
return;
553577
}
578+
579+
_reallocation_failures.try_emplace(ntp, details);
554580
}
555581

556582
static bool has_quorum(
@@ -821,6 +847,54 @@ class partition_balancer_planner::immutable_partition {
821847
change_reason);
822848
}
823849

850+
bool contains_replica(model::node_id replica) const {
851+
return contains_node(_replicas, replica);
852+
}
853+
854+
reallocation_failure_details map_immutability_reason_to_failure_details(
855+
change_reason reason, model::node_id replica_to_move) {
856+
reallocation_failure_details details{
857+
.replica_to_move = replica_to_move,
858+
.reason = reason,
859+
.error = details.error = reallocation_error::unknown_error};
860+
861+
switch (_reason) {
862+
case immutability_reason::no_quorum:
863+
details.error = reallocation_error::no_quorum;
864+
break;
865+
case immutability_reason::no_size_info:
866+
details.error = reallocation_error::missing_partition_size_info;
867+
break;
868+
case immutability_reason::reconfiguration_state:
869+
details.error = reallocation_error::reconfiguration_in_progress;
870+
break;
871+
case immutability_reason::disabled:
872+
details.error = reallocation_error::partition_disabled;
873+
break;
874+
case immutability_reason::batch_full:
875+
break;
876+
}
877+
878+
return details;
879+
}
880+
881+
void report_immutable_partition_as_reallocation_failure(
882+
request_context& ctx,
883+
const std::vector<model::node_id>& replicas_to_move,
884+
change_reason reason) {
885+
if (_reason == immutability_reason::batch_full) {
886+
return;
887+
}
888+
// report per replica reallocation failures
889+
for (const auto replica : replicas_to_move) {
890+
if (contains_replica(replica)) {
891+
ctx.report_reallocation_failure(
892+
ntp(),
893+
map_immutability_reason_to_failure_details(reason, replica));
894+
}
895+
}
896+
}
897+
824898
private:
825899
friend class request_context;
826900

@@ -1497,8 +1571,10 @@ ss::future<> partition_balancer_planner::get_node_drain_actions(
14971571
ctx.config().hard_max_disk_usage_ratio,
14981572
reason);
14991573
if (!result) {
1500-
ctx.report_decommission_reallocation_failure(
1501-
replica, part.ntp());
1574+
ctx.report_reallocation_failure(
1575+
part.ntp(),
1576+
map_result_to_failure_details(
1577+
reason, result.error(), replica));
15021578
}
15031579
}
15041580
}
@@ -1522,7 +1598,11 @@ ss::future<> partition_balancer_planner::get_node_drain_actions(
15221598
}
15231599
},
15241600
[&](force_reassignable_partition&) {},
1525-
[&](immutable_partition& part) { part.report_failure(reason); });
1601+
[&](immutable_partition& part) {
1602+
part.report_failure(reason);
1603+
part.report_immutable_partition_as_reallocation_failure(
1604+
ctx, to_move, reason);
1605+
});
15261606

15271607
return ss::stop_iteration::no;
15281608
});
@@ -1589,15 +1669,25 @@ ss::future<> partition_balancer_planner::get_rack_constraint_repair_actions(
15891669
if (part.is_original(replica)) {
15901670
// only move replicas that haven't been moved for
15911671
// other reasons
1592-
(void)part.move_replica(
1672+
auto r = part.move_replica(
15931673
replica,
15941674
ctx.config().soft_max_disk_usage_ratio,
15951675
change_reason::rack_constraint_repair);
1676+
if (r.has_error()) {
1677+
ctx.report_reallocation_failure(
1678+
part.ntp(),
1679+
map_result_to_failure_details(
1680+
change_reason::rack_constraint_repair,
1681+
r.error(),
1682+
replica));
1683+
}
15961684
}
15971685
}
15981686
},
1599-
[](immutable_partition& part) {
1687+
[&](immutable_partition& part) {
16001688
part.report_failure(change_reason::rack_constraint_repair);
1689+
part.report_immutable_partition_as_reallocation_failure(
1690+
ctx, to_move, change_reason::rack_constraint_repair);
16011691
},
16021692
[](moving_partition&) {},
16031693
[](force_reassignable_partition&) {});
@@ -1814,12 +1904,23 @@ partition_balancer_planner::get_full_node_actions(request_context& ctx) {
18141904
});
18151905

18161906
for (const auto& replica : full_node_replicas) {
1817-
(void)part.move_replica(
1907+
auto r = part.move_replica(
18181908
replica.node_id,
18191909
ctx.config().soft_max_disk_usage_ratio,
18201910
change_reason::disk_full);
1911+
if (r.has_error()) {
1912+
ctx.report_reallocation_failure(
1913+
part.ntp(),
1914+
map_result_to_failure_details(
1915+
change_reason::disk_full,
1916+
r.error(),
1917+
replica.node_id));
1918+
}
18211919
}
18221920
},
1921+
[&](immutable_partition& part) {
1922+
part.report_failure(change_reason::disk_full);
1923+
},
18231924
[](auto&) {});
18241925
});
18251926
}
@@ -1940,6 +2041,12 @@ ss::future<> partition_balancer_planner::get_counts_rebalancing_actions(
19402041
ctx.config().soft_max_disk_usage_ratio,
19412042
change_reason::partition_count_rebalancing);
19422043
if (!res) {
2044+
ctx.report_reallocation_failure(
2045+
part.ntp(),
2046+
map_result_to_failure_details(
2047+
change_reason::partition_count_rebalancing,
2048+
res.error(),
2049+
node));
19432050
return;
19442051
}
19452052

@@ -1959,6 +2066,8 @@ ss::future<> partition_balancer_planner::get_counts_rebalancing_actions(
19592066
},
19602067
[&](immutable_partition& p) {
19612068
p.report_failure(change_reason::partition_count_rebalancing);
2069+
p.report_immutable_partition_as_reallocation_failure(
2070+
ctx, {node}, change_reason::partition_count_rebalancing);
19622071
should_stop = false;
19632072
},
19642073
[](auto&) {});
@@ -2048,8 +2157,7 @@ void partition_balancer_planner::request_context::collect_actions(
20482157

20492158
result.counts_rebalancing_finished = _counts_rebalancing_finished;
20502159

2051-
result.decommission_realloc_failures = std::move(
2052-
_decommission_realloc_failures);
2160+
result.reallocation_failures = std::move(_reallocation_failures);
20532161

20542162
if (
20552163
!result.cancellations.empty() || !result.reassignments.empty()

src/v/cluster/partition_balancer_planner.h

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -84,11 +84,13 @@ class partition_balancer_planner {
8484
partition_balancer_violations violations;
8585
std::vector<ntp_reassignment> reassignments;
8686
std::vector<model::ntp> cancellations;
87-
absl::flat_hash_map<model::node_id, absl::btree_set<model::ntp>>
88-
decommission_realloc_failures;
87+
chunked_hash_map<model::ntp, reallocation_failure_details>
88+
reallocation_failures;
8989
bool counts_rebalancing_finished = false;
9090
size_t failed_actions_count = 0;
9191
status status = status::empty;
92+
93+
void maybe_add_reallocation_failure();
9294
};
9395

9496
ss::future<plan_data>

src/v/cluster/partition_balancer_types.cc

Lines changed: 43 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,43 @@ partition_balancer_violations::partition_balancer_violations(
7575
: unavailable_nodes(std::move(un))
7676
, full_nodes(std::move(fn)) {}
7777

78+
void partition_balancer_overview_reply::set_reallocation_failures(
79+
const chunked_hash_map<model::ntp, reallocation_failure_details>&
80+
reallocations) {
81+
reallocation_failures.reserve(reallocations.size());
82+
for (const auto& [ntp, details] : reallocations) {
83+
reallocation_failures.emplace(ntp, details);
84+
/**
85+
* Fill in the decommission_realloc_failures map with the reallocation
86+
* failures for backward compatibility.
87+
*/
88+
if (details.reason == change_reason::node_decommissioning) {
89+
auto& failed_ntps
90+
= decommission_realloc_failures[details.replica_to_move];
91+
failed_ntps.insert(ntp);
92+
}
93+
}
94+
}
95+
96+
partition_balancer_overview_reply
97+
partition_balancer_overview_reply::copy() const {
98+
partition_balancer_overview_reply copy;
99+
copy.error = error;
100+
copy.last_tick_time = last_tick_time;
101+
copy.status = status;
102+
copy.violations = violations;
103+
copy.partitions_pending_force_recovery_count
104+
= partitions_pending_force_recovery_count;
105+
copy.partitions_pending_force_recovery_sample
106+
= partitions_pending_force_recovery_sample;
107+
copy.decommission_realloc_failures = decommission_realloc_failures;
108+
copy.reallocation_failures.reserve(reallocation_failures.size());
109+
for (const auto& [ntp, details] : reallocation_failures) {
110+
copy.reallocation_failures.emplace(ntp, details);
111+
}
112+
return copy;
113+
}
114+
78115
std::ostream&
79116
operator<<(std::ostream& o, const partition_balancer_violations& v) {
80117
fmt::print(
@@ -117,12 +154,16 @@ operator<<(std::ostream& o, const partition_balancer_overview_reply& rep) {
117154
fmt::print(
118155
o,
119156
"{{ error: {} last_tick_time: {} status: {} violations: {}, "
120-
"partitions_pending_force_recovery: {}}}",
157+
"partitions_pending_force_recovery: {}, "
158+
"decommission_reallocation_failures_count: {}, reallocation_failures: "
159+
"{}}}",
121160
rep.error,
122161
rep.last_tick_time,
123162
rep.status,
124163
rep.violations,
125-
rep.partitions_pending_force_recovery_count);
164+
rep.partitions_pending_force_recovery_count,
165+
rep.decommission_realloc_failures.size(),
166+
fmt::join(rep.reallocation_failures | std::views::values, ", "));
126167
return o;
127168
}
128169

0 commit comments

Comments
 (0)