From 7c9b1c031b6b8a4f4d38de865cad9c55dcd4da7e Mon Sep 17 00:00:00 2001 From: Tushar Jain Date: Mon, 16 Jun 2025 15:15:42 -0700 Subject: [PATCH] fix diloco integration test Summary: - for diloco, the model parameters, as saved by the test, can be different across replicas - only the global parameters are expected to be the same - fix the test to validate that the global parameters are the same instead of the local model parameters Test Plan: ``` $ pytest -v ./torchft/local_sgd_integ_test.py::LocalSGDIntegTest::test_diloco_recovery_0 ``` --- torchft/local_sgd_integ_test.py | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/torchft/local_sgd_integ_test.py b/torchft/local_sgd_integ_test.py index 9bc7b5f3..dde67b52 100644 --- a/torchft/local_sgd_integ_test.py +++ b/torchft/local_sgd_integ_test.py @@ -414,13 +414,26 @@ def test_diloco_recovery(self, use_cuda: bool) -> None: rep0, rep1 = state_dicts for step in rep0.keys(): - # Inner optimizer will be different, outer optimizer and model should be the same + # Inner optimizer and local model parameters will be different e.g. + # with 2 replicas r1 and r2, we sync every 2 steps + # + # - Manager Step 1 + # - Step 1: r1 and r2 step + # - Step 2: r1 and r2 step, sync the model, quorum succeeds + # - Manager Step 2 + # - Step 1: r1 steps but r2 fails + # - Step 2: + # - r1 steps, sync fails because r2 is down + # - r1 recovers r2 from the model state at this step + # that is different from the model for r1 at the beginning + # of step Manager Step 2 + # + # Outer optimizer and global model should be the same + torch.testing.assert_close( - rep1[step]["model"], - rep0[step]["model"], + rep1[step]["original_params"], + rep0[step]["original_params"], check_device=False, - rtol=1e-4, - atol=1e-4, ) torch.testing.assert_close( rep1[step]["outer_optim"],