
Commit 9ae71da

retry quorum
Summary:
- We currently don't retry quorum requests from the manager to the lighthouse.
- If the lighthouse crashes, this can result in all replicas crashing.
- So add retries, configurable through an env var.
1 parent ab11bce commit 9ae71da

File tree

4 files changed: +70 −13 lines changed

src/lib.rs

Lines changed: 3 additions & 0 deletions
@@ -71,6 +71,7 @@ fn num_threads() -> usize {
 /// world_size (int): The world size of the replica group.
 /// heartbeat_interval (timedelta): The interval at which heartbeats are sent.
 /// connect_timeout (timedelta): The timeout for connecting to the lighthouse server.
+/// quorum_retries (int): The number of retries for quorum requests to the lighthouse server.
 #[pyclass]
 struct ManagerServer {
     handle: JoinHandle<Result<()>>,
@@ -91,6 +92,7 @@ impl ManagerServer {
         world_size: u64,
         heartbeat_interval: Duration,
         connect_timeout: Duration,
+        quorum_retries: i64,
     ) -> PyResult<Self> {
         py.allow_threads(move || {
             let runtime = tokio::runtime::Builder::new_multi_thread()
@@ -108,6 +110,7 @@ impl ManagerServer {
                 world_size,
                 heartbeat_interval,
                 connect_timeout,
+                quorum_retries,
             ))
             .map_err(|e| PyRuntimeError::new_err(e.to_string()))?;
             let handle = runtime.spawn(manager.clone().run());
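The binding change above simply threads the new integer through the PyO3 constructor. For readers unfamiliar with that pattern, here is a minimal, self-contained sketch of exposing an integer option through a #[pyclass] constructor; the RetryConfig type and its getter are placeholders for illustration, not torchft code:

use pyo3::prelude::*;

// Minimal sketch of threading a new integer option through a PyO3
// constructor, as the diff above does with `quorum_retries`.
// `RetryConfig` is a placeholder type, not part of torchft.
#[pyclass]
struct RetryConfig {
    quorum_retries: i64,
}

#[pymethods]
impl RetryConfig {
    #[new]
    fn new(quorum_retries: i64) -> Self {
        Self { quorum_retries }
    }

    /// Number of quorum retries configured from Python.
    #[getter]
    fn quorum_retries(&self) -> i64 {
        self.quorum_retries
    }
}

From Python this would be constructed as RetryConfig(quorum_retries=3), mirroring how ManagerServer gains its new keyword argument.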

src/manager.rs

Lines changed: 60 additions & 13 deletions
@@ -10,6 +10,7 @@ use std::collections::HashSet;
 use std::sync::Arc;
 use std::time::Duration;

+use crate::torchftpb::LighthouseQuorumResponse;
 use anyhow::Result;
 use tokio::sync::broadcast;
 use tokio::sync::Mutex;
@@ -72,6 +73,8 @@ pub struct Manager {
     local_addr: SocketAddr,
     heartbeat_interval: Duration,
     lighthouse_client: LighthouseServiceClient<Channel>,
+    lighthouse_addr: String,
+    quorum_retries: i64,
 }

 pub async fn manager_client_new(
@@ -108,6 +111,7 @@ impl Manager {
         world_size: u64,
         heartbeat_interval: Duration,
         connect_timeout: Duration,
+        quorum_retries: i64,
     ) -> Result<Arc<Self>> {
         let listener = tokio::net::TcpListener::bind(&bind).await?;
         let local_addr = listener.local_addr()?;
@@ -120,6 +124,7 @@ impl Manager {
         Ok(Arc::new(Self {
             replica_id: replica_id,
             lighthouse_client: client,
+            lighthouse_addr,
             hostname: hostname,
             store_address: store_addr,
             world_size: world_size,
@@ -135,6 +140,7 @@ impl Manager {
             }),
             local_addr: local_addr,
             listener: Mutex::new(Some(listener)),
+            quorum_retries,
         }))
     }

@@ -197,21 +203,13 @@ impl Manager {

         // TODO: don't hold the lock during quorum

-        let mut client = self.lighthouse_client.clone();
-
-        let mut lighthouse_request = tonic::Request::new(LighthouseQuorumRequest {
+        let lighthouse_request = LighthouseQuorumRequest {
             requester: Some(requester),
-        });
-        lighthouse_request.set_timeout(timeout);
+        };

-        let response = tokio::time::timeout(timeout, client.quorum(lighthouse_request))
-            .await
-            .unwrap_or_else(|e| {
-                Err(Status::cancelled(format!(
-                    "lighthouse quorum timed out: {}",
-                    e.to_string()
-                )))
-            })?;
+        let response = self
+            ._quorum_with_retries(timeout, lighthouse_request)
+            .await?;
         let resp = response.into_inner();

         info_with_replica!(self.replica_id, "got lighthouse quorum {:?}", resp);
@@ -226,6 +224,51 @@ impl Manager {

         Ok(())
     }
+
+    async fn _quorum_with_retries(
+        &self,
+        timeout: Duration,
+        lighthouse_request: LighthouseQuorumRequest,
+    ) -> Result<tonic::Response<LighthouseQuorumResponse>, Status> {
+        let mut client = self.lighthouse_client.clone();
+
+        let mut retry_count = 0;
+        loop {
+            let mut request = tonic::Request::new(lighthouse_request.clone());
+            request.set_timeout(timeout);
+
+            let result = tokio::time::timeout(timeout, client.quorum(request)).await;
+
+            match result {
+                Ok(response) => {
+                    return response;
+                }
+                Err(e) => {
+                    info_with_replica!(
+                        self.replica_id,
+                        "lighthouse quorum failed. error: {}",
+                        e.to_string()
+                    );
+
+                    if retry_count == self.quorum_retries {
+                        return Err(Status::internal(format!(
+                            "lighthouse quorum failed after {} retries. error: {}",
+                            retry_count,
+                            e.to_string(),
+                        )));
+                    }
+
+                    tokio::time::sleep(tokio::time::Duration::from_millis(100)).await;
+
+                    // Reset the client since the lighthouse server might have failed;
+                    // rebuild it from the stored address rather than reusing the stale channel.
+                    client = lighthouse_client_new(self.lighthouse_addr.clone(), self.connect_timeout)
+                        .await
+                        .map_err(|e| Status::unavailable(e.to_string()))?;
+
+                    retry_count += 1;
+                }
+            }
+        }
+    }
 }

 #[tonic::async_trait]
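The new helper gives each attempt the full request timeout, logs the failure, rebuilds the lighthouse client (the server may have come back at the same address), and sleeps 100 ms before retrying until the quorum_retries budget is spent. Note that in the code above, tokio::time::timeout only yields an Err when the deadline elapses, so an RPC that fails fast is returned to the caller as-is. The retry skeleton in isolation, as a minimal runnable sketch; the with_retries helper and the example error string are illustrative, not torchft API, and a tokio runtime is assumed:

use std::future::Future;
use std::time::Duration;

// Retry a fallible async operation up to `retries` additional times,
// sleeping 100 ms between attempts; once the retry budget is exhausted,
// the last error is surfaced to the caller. Illustrative helper only.
async fn with_retries<T, E, F, Fut>(retries: i64, mut attempt: F) -> Result<T, E>
where
    F: FnMut() -> Fut,
    Fut: Future<Output = Result<T, E>>,
{
    let mut retry_count = 0;
    loop {
        match attempt().await {
            Ok(value) => return Ok(value),
            Err(e) => {
                if retry_count == retries {
                    // Out of retries: return the final error.
                    return Err(e);
                }
                tokio::time::sleep(Duration::from_millis(100)).await;
                retry_count += 1;
            }
        }
    }
}

#[tokio::main]
async fn main() {
    // Succeeds on the third attempt when given a budget of 2 retries.
    let mut calls = 0;
    let result = with_retries(2, || {
        calls += 1;
        let this_call = calls;
        async move {
            if this_call < 3 {
                Err("lighthouse unavailable")
            } else {
                Ok(this_call)
            }
        }
    })
    .await;
    assert_eq!(result, Ok(3));
}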
@@ -563,6 +606,7 @@ mod tests {
             2, // world size
             Duration::from_millis(100), // heartbeat interval
             Duration::from_secs(10), // connect timeout
+            0, // quorum retries
         )
         .await?;
         let manager_fut = tokio::spawn(manager._run_grpc());
@@ -610,6 +654,7 @@ mod tests {
             1, // world size
             Duration::from_millis(100), // heartbeat interval
             Duration::from_secs(10), // connect timeout
+            0, // quorum retries
         )
         .await?;
         let manager_fut = tokio::spawn(manager.clone().run());
@@ -671,6 +716,7 @@ mod tests {
             1, // world size
             Duration::from_millis(100), // heartbeat interval
             Duration::from_secs(10), // connect timeout
+            0, // quorum retries
         )
         .await?;
         let manager_fut = tokio::spawn(manager.clone().run());
@@ -737,6 +783,7 @@ mod tests {
             1, // world size
             Duration::from_millis(100), // heartbeat interval
             Duration::from_secs(10), // connect timeout
+            0, // quorum retries
         )
         .await?;
         let manager_fut = tokio::spawn(manager.clone().run());

torchft/_torchft.pyi

Lines changed: 1 addition & 0 deletions
@@ -48,6 +48,7 @@ class ManagerServer:
         world_size: int,
         heartbeat_interval: timedelta,
         connect_timeout: timedelta,
+        quorum_retries: int,
     ) -> None: ...
     def address(self) -> str: ...
     def shutdown(self) -> None: ...

torchft/manager.py

Lines changed: 6 additions & 0 deletions
@@ -67,6 +67,11 @@
 QUORUM_TIMEOUT_SEC_ENV: str = "TORCHFT_QUORUM_TIMEOUT_SEC"
 CONNECT_TIMEOUT_SEC_ENV: str = "TORCHFT_CONNECT_TIMEOUT_SEC"

+# Environment variable for the number of retries to use for the quorum.
+# We need to retry the quorum in case the lighthouse fails; otherwise, if we
+# crash whenever a call to quorum fails, all replicas will crash.
+QUORUM_RETRIES_ENV: str = "TORCHFT_QUORUM_RETRIES"
+
 T = TypeVar("T")


@@ -277,6 +282,7 @@ def __init__(
             world_size=group_world_size,
             heartbeat_interval=heartbeat_interval,
             connect_timeout=connect_timeout,
+            quorum_retries=int(os.environ.get(QUORUM_RETRIES_ENV, "0")),
         )

         self._store.set(MANAGER_ADDR_KEY, self._manager.address())
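With this default, behavior is unchanged unless TORCHFT_QUORUM_RETRIES is set in the environment. A hypothetical Rust rendering of the same default handling, shown only to make the parsing explicit (torchft reads this in Python, as above):

use std::env;

// Read TORCHFT_QUORUM_RETRIES, defaulting to 0 when unset or unparsable.
// Mirrors `int(os.environ.get(QUORUM_RETRIES_ENV, "0"))` from the diff above,
// except that the Python version raises on an unparsable value.
fn quorum_retries_from_env() -> i64 {
    env::var("TORCHFT_QUORUM_RETRIES")
        .ok()
        .and_then(|v| v.parse().ok())
        .unwrap_or(0)
}

fn main() {
    println!("quorum retries: {}", quorum_retries_from_env());
}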
