@@ -10,6 +10,7 @@ use std::collections::HashSet;
10
10
use std:: sync:: Arc ;
11
11
use std:: time:: Duration ;
12
12
13
+ use crate :: torchftpb:: LighthouseQuorumResponse ;
13
14
use anyhow:: Result ;
14
15
use tokio:: sync:: broadcast;
15
16
use tokio:: sync:: Mutex ;
@@ -72,6 +73,7 @@ pub struct Manager {
72
73
local_addr : SocketAddr ,
73
74
heartbeat_interval : Duration ,
74
75
lighthouse_client : LighthouseServiceClient < Channel > ,
76
+ quorum_retries : i64 ,
75
77
}
76
78
77
79
pub async fn manager_client_new (
@@ -108,6 +110,7 @@ impl Manager {
108
110
world_size : u64 ,
109
111
heartbeat_interval : Duration ,
110
112
connect_timeout : Duration ,
113
+ quorum_retries : i64 ,
111
114
) -> Result < Arc < Self > > {
112
115
let listener = tokio:: net:: TcpListener :: bind ( & bind) . await ?;
113
116
let local_addr = listener. local_addr ( ) ?;
@@ -135,6 +138,7 @@ impl Manager {
135
138
} ) ,
136
139
local_addr : local_addr,
137
140
listener : Mutex :: new ( Some ( listener) ) ,
141
+ quorum_retries,
138
142
} ) )
139
143
}
140
144
@@ -197,21 +201,13 @@ impl Manager {
197
201
198
202
// TODO: don't hold the lock during quorum
199
203
200
- let mut client = self . lighthouse_client . clone ( ) ;
201
-
202
- let mut lighthouse_request = tonic:: Request :: new ( LighthouseQuorumRequest {
204
+ let lighthouse_request = LighthouseQuorumRequest {
203
205
requester : Some ( requester) ,
204
- } ) ;
205
- lighthouse_request. set_timeout ( timeout) ;
206
+ } ;
206
207
207
- let response = tokio:: time:: timeout ( timeout, client. quorum ( lighthouse_request) )
208
- . await
209
- . unwrap_or_else ( |e| {
210
- Err ( Status :: cancelled ( format ! (
211
- "lighthouse quorum timed out: {}" ,
212
- e. to_string( )
213
- ) ) )
214
- } ) ?;
208
+ let response = self
209
+ . _quorum_with_retries ( timeout, lighthouse_request)
210
+ . await ?;
215
211
let resp = response. into_inner ( ) ;
216
212
217
213
info_with_replica ! ( self . replica_id, "got lighthouse quorum {:?}" , resp) ;
@@ -226,6 +222,45 @@ impl Manager {
226
222
227
223
Ok ( ( ) )
228
224
}
225
+
226
+ async fn _quorum_with_retries (
227
+ & self ,
228
+ timeout : Duration ,
229
+ lighthouse_request : LighthouseQuorumRequest ,
230
+ ) -> Result < tonic:: Response < LighthouseQuorumResponse > , Status > {
231
+ let mut client = self . lighthouse_client . clone ( ) ;
232
+
233
+ let mut retry_count = 0 ;
234
+ loop {
235
+ let mut request = tonic:: Request :: new ( lighthouse_request. clone ( ) ) ;
236
+ request. set_timeout ( timeout) ;
237
+
238
+ let result = tokio:: time:: timeout ( timeout, client. quorum ( request) ) . await ;
239
+
240
+ match result {
241
+ Ok ( response) => {
242
+ return response;
243
+ }
244
+ Err ( e) => {
245
+ info_with_replica ! (
246
+ self . replica_id,
247
+ "lighthouse quorum timed out: {}" ,
248
+ e. to_string( )
249
+ ) ;
250
+
251
+ if retry_count == self . quorum_retries {
252
+ return Err ( Status :: internal ( format ! (
253
+ "lighthouse quorum timed out after {} retries" ,
254
+ retry_count
255
+ ) ) ) ;
256
+ }
257
+
258
+ tokio:: time:: sleep ( tokio:: time:: Duration :: from_millis ( 100 ) ) . await ;
259
+ retry_count += 1 ;
260
+ }
261
+ }
262
+ }
263
+ }
229
264
}
230
265
231
266
#[ tonic:: async_trait]
@@ -563,6 +598,7 @@ mod tests {
563
598
2 , // world size
564
599
Duration :: from_millis ( 100 ) , // heartbeat interval
565
600
Duration :: from_secs ( 10 ) , // connect timeout
601
+ 0 , // quorum retries
566
602
)
567
603
. await ?;
568
604
let manager_fut = tokio:: spawn ( manager. _run_grpc ( ) ) ;
@@ -610,6 +646,7 @@ mod tests {
610
646
1 , // world size
611
647
Duration :: from_millis ( 100 ) , // heartbeat interval
612
648
Duration :: from_secs ( 10 ) , // connect timeout
649
+ 0 , // quorum retries
613
650
)
614
651
. await ?;
615
652
let manager_fut = tokio:: spawn ( manager. clone ( ) . run ( ) ) ;
@@ -671,6 +708,7 @@ mod tests {
671
708
1 , // world size
672
709
Duration :: from_millis ( 100 ) , // heartbeat interval
673
710
Duration :: from_secs ( 10 ) , // connect timeout
711
+ 0 , // quorum retries
674
712
)
675
713
. await ?;
676
714
let manager_fut = tokio:: spawn ( manager. clone ( ) . run ( ) ) ;
@@ -737,6 +775,7 @@ mod tests {
737
775
1 , // world size
738
776
Duration :: from_millis ( 100 ) , // heartbeat interval
739
777
Duration :: from_secs ( 10 ) , // connect timeout
778
+ 0 , // quorum retries
740
779
)
741
780
. await ?;
742
781
let manager_fut = tokio:: spawn ( manager. clone ( ) . run ( ) ) ;
0 commit comments