From 583a9a3621896b6e7c09fe3d27b1c68bcc0f187b Mon Sep 17 00:00:00 2001
From: Matt Corallo
Date: Mon, 14 Jul 2025 20:31:22 +0000
Subject: [PATCH] Ensure partial MPP claims continue to block channels on restart

In 9cc6e08965c8d326b16a3dd9af9e33308f066770, we started using the
`RAAMonitorUpdateBlockingAction` logic to block RAAs which may remove a
preimage from one `ChannelMonitor` if it isn't durably stored in another
that is a part of the same MPP claim.

Then, in 254b78fd3553c977a00f9b81cf829e81427b0a04, when we claimed a
payment, if we saw that the HTLC was already claimed in the channel, we'd
simply drop the new RAA blocker. This can happen on reload when replaying
MPP claims.

However, just because an HTLC is no longer present in the
`ChannelManager`'s `Channel` doesn't mean that the `ChannelMonitorUpdate`
which stored the preimage actually made it durably into the
`ChannelMonitor` on disk.

We could begin an MPP payment, have one channel get the preimage durably
into its `ChannelMonitor`, then step forward another update with the
peer. Then, we could reload, causing the MPP claims to be replayed across
all channels, leading to the RAA blocker(s) being dropped and all
channels being unlocked. Finally, if the first channel managed to step
forward a further update with its peer before the (now-replayed)
`ChannelMonitorUpdate`s for all MPP parts make it to disk, we could
theoretically lose the preimage.

This is, of course, a somewhat comically unlikely scenario, but I had an
old note to expand the test and it turned up the issue, so we might as
well fix it.
---
 lightning/src/ln/chanmon_update_fail_tests.rs | 119 +++++++++++++++++-
 lightning/src/ln/channelmanager.rs            |  41 ++++--
 2 files changed, 145 insertions(+), 15 deletions(-)

diff --git a/lightning/src/ln/chanmon_update_fail_tests.rs b/lightning/src/ln/chanmon_update_fail_tests.rs
index 878608e14f6..0a8f258935c 100644
--- a/lightning/src/ln/chanmon_update_fail_tests.rs
+++ b/lightning/src/ln/chanmon_update_fail_tests.rs
@@ -4218,13 +4218,17 @@ fn test_glacial_peer_cant_hang() {
 	do_test_glacial_peer_cant_hang(true);
 }
 
-#[test]
-fn test_partial_claim_mon_update_compl_actions() {
+fn do_test_partial_claim_mon_update_compl_actions(reload_a: bool, reload_b: bool) {
 	// Test that if we have an MPP claim that we ensure the preimage for the claim is retained in
 	// all the `ChannelMonitor`s until the preimage reaches every `ChannelMonitor` for a channel
 	// which was a part of the MPP.
 	let chanmon_cfgs = create_chanmon_cfgs(4);
 	let node_cfgs = create_node_cfgs(4, &chanmon_cfgs);
+
+	let (persister, persister_2, persister_3);
+	let (new_chain_mon, new_chain_mon_2, new_chain_mon_3);
+	let (nodes_3_reload, nodes_3_reload_2, nodes_3_reload_3);
+
 	let node_chanmgrs = create_node_chanmgrs(4, &node_cfgs, &[None, None, None, None]);
 	let mut nodes = create_network(4, &node_cfgs, &node_chanmgrs);
 
@@ -4253,6 +4257,8 @@
 	let paths = &[&[&nodes[1], &nodes[3]][..], &[&nodes[2], &nodes[3]][..]];
 	send_along_route_with_secret(&nodes[0], route, paths, 200_000, payment_hash, payment_secret);
 
+	// Store the monitor for channel 4 without the preimage to use on reload
+	let chan_4_monitor_serialized = get_monitor!(nodes[3], chan_4_id).encode();
 	// Claim along both paths, but only complete one of the two monitor updates.
 	chanmon_cfgs[3].persister.set_update_ret(ChannelMonitorUpdateStatus::InProgress);
 	chanmon_cfgs[3].persister.set_update_ret(ChannelMonitorUpdateStatus::InProgress);
@@ -4264,7 +4270,13 @@
 	// Complete the 1<->3 monitor update and play the commitment_signed dance forward until it
 	// blocks.
 	nodes[3].chain_monitor.complete_sole_pending_chan_update(&chan_3_id);
-	expect_payment_claimed!(&nodes[3], payment_hash, 200_000);
+	let payment_claimed = nodes[3].node.get_and_clear_pending_events();
+	assert_eq!(payment_claimed.len(), 1, "{payment_claimed:?}");
+	if let Event::PaymentClaimed { payment_hash: ev_hash, .. } = &payment_claimed[0] {
+		assert_eq!(*ev_hash, payment_hash);
+	} else {
+		panic!("{payment_claimed:?}");
+	}
 
 	let mut updates = get_htlc_update_msgs(&nodes[3], &node_b_id);
 	nodes[1].node.handle_update_fulfill_htlc(node_d_id, updates.update_fulfill_htlcs.remove(0));
@@ -4283,15 +4295,41 @@
 	check_added_monitors(&nodes[3], 0);
 	assert!(nodes[3].node.get_and_clear_pending_msg_events().is_empty());
 
+	if reload_a {
+		// After a reload (with the monitor not yet fully updated), the RAA should still be blocked
+		// waiting until the monitor update completes.
+		let node_ser = nodes[3].node.encode();
+		let chan_3_monitor_serialized = get_monitor!(nodes[3], chan_3_id).encode();
+		let mons = &[&chan_3_monitor_serialized[..], &chan_4_monitor_serialized[..]];
+		reload_node!(nodes[3], &node_ser, mons, persister, new_chain_mon, nodes_3_reload);
+		// The final update to channel 4 should be replayed.
+		persister.set_update_ret(ChannelMonitorUpdateStatus::InProgress);
+		assert!(nodes[3].node.get_and_clear_pending_msg_events().is_empty());
+		check_added_monitors(&nodes[3], 1);
+
+		// Because the HTLCs aren't yet cleared, the PaymentClaimed event will be replayed on
+		// restart.
+		let second_payment_claimed = nodes[3].node.get_and_clear_pending_events();
+		assert_eq!(payment_claimed, second_payment_claimed);
+
+		nodes[1].node.peer_disconnected(node_d_id);
+		nodes[2].node.peer_disconnected(node_d_id);
+		reconnect_nodes(ReconnectArgs::new(&nodes[1], &nodes[3]));
+		reconnect_nodes(ReconnectArgs::new(&nodes[2], &nodes[3]));
+
+		assert!(nodes[3].node.get_and_clear_pending_msg_events().is_empty());
+	}
+
 	// Now double-check that the preimage is still in the 1<->3 channel and complete the pending
 	// monitor update, allowing node 3 to claim the payment on the 2<->3 channel. This also
 	// unblocks the 1<->3 channel, allowing node 3 to release the two blocked monitor updates and
 	// respond to the final commitment_signed.
 	assert!(get_monitor!(nodes[3], chan_3_id).get_stored_preimages().contains_key(&payment_hash));
+	assert!(nodes[3].node.get_and_clear_pending_events().is_empty());
 	nodes[3].chain_monitor.complete_sole_pending_chan_update(&chan_4_id);
 
 	let mut ds_msgs = nodes[3].node.get_and_clear_pending_msg_events();
-	assert_eq!(ds_msgs.len(), 2);
+	assert_eq!(ds_msgs.len(), 2, "{ds_msgs:?}");
 	check_added_monitors(&nodes[3], 2);
 
 	match remove_first_msg_event_to_node(&node_b_id, &mut ds_msgs) {
@@ -4335,13 +4373,86 @@
 	assert!(get_monitor!(nodes[3], chan_3_id).get_stored_preimages().contains_key(&payment_hash));
 	assert!(get_monitor!(nodes[3], chan_4_id).get_stored_preimages().contains_key(&payment_hash));
 
+	if reload_b {
+		// Ensure that the channel pause logic doesn't accidentally get restarted after a second
+		// reload once the HTLCs for the first payment have been removed and the monitors
+		// completed.
+		let node_ser = nodes[3].node.encode();
+		let chan_3_monitor_serialized = get_monitor!(nodes[3], chan_3_id).encode();
+		let chan_4_monitor_serialized = get_monitor!(nodes[3], chan_4_id).encode();
+		let mons = &[&chan_3_monitor_serialized[..], &chan_4_monitor_serialized[..]];
+		reload_node!(nodes[3], &node_ser, mons, persister_2, new_chain_mon_2, nodes_3_reload_2);
+		check_added_monitors(&nodes[3], 0);
+
+		nodes[1].node.peer_disconnected(node_d_id);
+		nodes[2].node.peer_disconnected(node_d_id);
+		reconnect_nodes(ReconnectArgs::new(&nodes[1], &nodes[3]));
+		reconnect_nodes(ReconnectArgs::new(&nodes[2], &nodes[3]));
+
+		assert!(nodes[3].node.get_and_clear_pending_msg_events().is_empty());
+
+		// Because the HTLCs aren't yet cleared, the PaymentClaimed event will be replayed on
+		// restart.
+		let third_payment_claimed = nodes[3].node.get_and_clear_pending_events();
+		assert_eq!(payment_claimed, third_payment_claimed);
+	}
+
 	send_payment(&nodes[1], &[&nodes[3]], 100_000);
 	assert!(!get_monitor!(nodes[3], chan_3_id).get_stored_preimages().contains_key(&payment_hash));
 
+	if reload_b {
+		// Ensure that the channel pause logic doesn't accidentally get restarted after a second
+		// reload once the HTLCs for the first payment have been removed and the monitors
+		// completed, even if only one of the two monitors still knows about the first payment.
+		let node_ser = nodes[3].node.encode();
+		let chan_3_monitor_serialized = get_monitor!(nodes[3], chan_3_id).encode();
+		let chan_4_monitor_serialized = get_monitor!(nodes[3], chan_4_id).encode();
+		let mons = &[&chan_3_monitor_serialized[..], &chan_4_monitor_serialized[..]];
+		reload_node!(nodes[3], &node_ser, mons, persister_3, new_chain_mon_3, nodes_3_reload_3);
+		check_added_monitors(&nodes[3], 0);
+
+		nodes[1].node.peer_disconnected(node_d_id);
+		nodes[2].node.peer_disconnected(node_d_id);
+		reconnect_nodes(ReconnectArgs::new(&nodes[1], &nodes[3]));
+		reconnect_nodes(ReconnectArgs::new(&nodes[2], &nodes[3]));
+
+		assert!(nodes[3].node.get_and_clear_pending_msg_events().is_empty());
+
+		// Because the HTLCs aren't yet cleared, the PaymentClaimed events for both payments will
+		// be replayed on restart.
+		// Use this as an opportunity to check the payment_ids are unique.
+		let mut events = nodes[3].node.get_and_clear_pending_events();
+		assert_eq!(events.len(), 2);
+		events.retain(|ev| *ev != payment_claimed[0]);
+		assert_eq!(events.len(), 1);
+		if let Event::PaymentClaimed { payment_id: original_payment_id, .. } = &payment_claimed[0] {
+			assert!(original_payment_id.is_some());
+			if let Event::PaymentClaimed { amount_msat, payment_id, .. } = &events[0] {
+				assert!(payment_id.is_some());
+				assert_ne!(original_payment_id, payment_id);
+				assert_eq!(*amount_msat, 100_000);
+			} else {
+				panic!("{events:?}");
+			}
+		} else {
+			panic!("{events:?}");
+		}
+
+		send_payment(&nodes[1], &[&nodes[3]], 100_000);
+	}
+
 	send_payment(&nodes[2], &[&nodes[3]], 100_000);
 	assert!(!get_monitor!(nodes[3], chan_4_id).get_stored_preimages().contains_key(&payment_hash));
 }
 
+#[test]
+fn test_partial_claim_mon_update_compl_actions() {
+	do_test_partial_claim_mon_update_compl_actions(true, true);
+	do_test_partial_claim_mon_update_compl_actions(true, false);
+	do_test_partial_claim_mon_update_compl_actions(false, true);
+	do_test_partial_claim_mon_update_compl_actions(false, false);
+}
+
 #[test]
 fn test_claim_to_closed_channel_blocks_forwarded_preimage_removal() {
 	// One of the last features for async persistence we implemented was the correct blocking of
diff --git a/lightning/src/ln/channelmanager.rs b/lightning/src/ln/channelmanager.rs
index 6f411273aab..7c6eb765204 100644
--- a/lightning/src/ln/channelmanager.rs
+++ b/lightning/src/ln/channelmanager.rs
@@ -8274,18 +8274,21 @@ where
 						// payment claim from a `ChannelMonitor`. In some cases (MPP or
 						// if the HTLC was only recently removed) we make such claims
 						// after an HTLC has been removed from a channel entirely, and
-						// thus the RAA blocker has long since completed.
+						// thus the RAA blocker may have long since completed.
 						//
-						// In any other case, the RAA blocker must still be present and
-						// blocking RAAs.
-						debug_assert!(
-							during_init
-								|| peer_state
-									.actions_blocking_raa_monitor_updates
-									.get(&chan_id)
-									.unwrap()
-									.contains(&raa_blocker)
-						);
+						// However, it's possible that the `ChannelMonitorUpdate` containing
+						// the preimage never completed and is still pending. In that case,
+						// we need to re-add the RAA blocker, which we do here. Handling
+						// the post-update action, below, will remove it again.
+						//
+						// In any other case (i.e. not during startup), the RAA blocker
+						// must still be present and blocking RAAs.
+						let actions = &mut peer_state.actions_blocking_raa_monitor_updates;
+						let actions_list = actions.entry(chan_id).or_insert_with(Vec::new);
+						if !actions_list.contains(&raa_blocker) {
+							debug_assert!(during_init);
+							actions_list.push(raa_blocker);
+						}
 					}
 					let action = if let Some(action) = action_opt {
 						action
@@ -8293,6 +8296,22 @@
 						return;
 					};
 
+					// If there are monitor updates in flight, we may be in the case
+					// described above, replaying a claim on startup which needs an RAA
+					// blocker to remain blocked. Thus, in such a case we simply push the
+					// post-update action to the blocked list and move on.
+					// In any case, we should err on the side of caution and not process
+					// the post-update action no matter the situation.
+					let in_flight_mons = peer_state.in_flight_monitor_updates.get(&chan_id);
+					if in_flight_mons.map(|(_, mons)| !mons.is_empty()).unwrap_or(false) {
+						peer_state
+							.monitor_update_blocked_actions
+							.entry(chan_id)
+							.or_insert_with(Vec::new)
+							.push(action);
+						return;
+					}
+
 					mem::drop(peer_state_opt);
 
 					log_trace!(logger, "Completing monitor update completion action for channel {} as claim was redundant: {:?}",