@@ -10,6 +10,7 @@ use std::collections::HashSet;
10
10
use std:: sync:: Arc ;
11
11
use std:: time:: Duration ;
12
12
13
+ use crate :: torchftpb:: LighthouseQuorumResponse ;
13
14
use anyhow:: Result ;
14
15
use tokio:: sync:: broadcast;
15
16
use tokio:: sync:: Mutex ;
@@ -72,6 +73,8 @@ pub struct Manager {
72
73
local_addr : SocketAddr ,
73
74
heartbeat_interval : Duration ,
74
75
lighthouse_client : LighthouseServiceClient < Channel > ,
76
+ lighthouse_addr : String ,
77
+ quorum_retries : i64 ,
75
78
}
76
79
77
80
pub async fn manager_client_new (
@@ -108,6 +111,7 @@ impl Manager {
108
111
world_size : u64 ,
109
112
heartbeat_interval : Duration ,
110
113
connect_timeout : Duration ,
114
+ quorum_retries : i64 ,
111
115
) -> Result < Arc < Self > > {
112
116
let listener = tokio:: net:: TcpListener :: bind ( & bind) . await ?;
113
117
let local_addr = listener. local_addr ( ) ?;
@@ -120,6 +124,7 @@ impl Manager {
120
124
Ok ( Arc :: new ( Self {
121
125
replica_id : replica_id,
122
126
lighthouse_client : client,
127
+ lighthouse_addr,
123
128
hostname : hostname,
124
129
store_address : store_addr,
125
130
world_size : world_size,
@@ -135,6 +140,7 @@ impl Manager {
135
140
} ) ,
136
141
local_addr : local_addr,
137
142
listener : Mutex :: new ( Some ( listener) ) ,
143
+ quorum_retries,
138
144
} ) )
139
145
}
140
146
@@ -197,21 +203,13 @@ impl Manager {
197
203
198
204
// TODO: don't hold the lock during quorum
199
205
200
- let mut client = self . lighthouse_client . clone ( ) ;
201
-
202
- let mut lighthouse_request = tonic:: Request :: new ( LighthouseQuorumRequest {
206
+ let lighthouse_request = LighthouseQuorumRequest {
203
207
requester : Some ( requester) ,
204
- } ) ;
205
- lighthouse_request. set_timeout ( timeout) ;
208
+ } ;
206
209
207
- let response = tokio:: time:: timeout ( timeout, client. quorum ( lighthouse_request) )
208
- . await
209
- . unwrap_or_else ( |e| {
210
- Err ( Status :: cancelled ( format ! (
211
- "lighthouse quorum timed out: {}" ,
212
- e. to_string( )
213
- ) ) )
214
- } ) ?;
210
+ let response = self
211
+ . _quorum_with_retries ( timeout, lighthouse_request)
212
+ . await ?;
215
213
let resp = response. into_inner ( ) ;
216
214
217
215
info_with_replica ! ( self . replica_id, "got lighthouse quorum {:?}" , resp) ;
@@ -226,6 +224,51 @@ impl Manager {
226
224
227
225
Ok ( ( ) )
228
226
}
227
+
228
+ async fn _quorum_with_retries (
229
+ & self ,
230
+ timeout : Duration ,
231
+ lighthouse_request : LighthouseQuorumRequest ,
232
+ ) -> Result < tonic:: Response < LighthouseQuorumResponse > , Status > {
233
+ let mut client = self . lighthouse_client . clone ( ) ;
234
+
235
+ let mut retry_count = 0 ;
236
+ loop {
237
+ let mut request = tonic:: Request :: new ( lighthouse_request. clone ( ) ) ;
238
+ request. set_timeout ( timeout) ;
239
+
240
+ let result = tokio:: time:: timeout ( timeout, client. quorum ( request) ) . await ;
241
+
242
+ match result {
243
+ Ok ( response) => {
244
+ return response;
245
+ }
246
+ Err ( e) => {
247
+ info_with_replica ! (
248
+ self . replica_id,
249
+ "lighthouse quorum failed. error: {}" ,
250
+ e. to_string( )
251
+ ) ;
252
+
253
+ if retry_count == self . quorum_retries {
254
+ return Err ( Status :: internal ( format ! (
255
+ "lighthouse quorum failed after {} retries. error: {}" ,
256
+ retry_count,
257
+ e. to_string( ) ,
258
+ ) ) ) ;
259
+ }
260
+
261
+ tokio:: time:: sleep ( tokio:: time:: Duration :: from_millis ( 100 ) ) . await ;
262
+
263
+ // Reset the client since the lighthouse server might have failed
264
+ self . lighthouse_client =
265
+ lighthouse_client_new ( lighthouse_addr. clone ( ) , connect_timeout) . await ?;
266
+
267
+ retry_count += 1 ;
268
+ }
269
+ }
270
+ }
271
+ }
229
272
}
230
273
231
274
#[ tonic:: async_trait]
@@ -563,6 +606,7 @@ mod tests {
563
606
2 , // world size
564
607
Duration :: from_millis ( 100 ) , // heartbeat interval
565
608
Duration :: from_secs ( 10 ) , // connect timeout
609
+ 0 , // quorum retries
566
610
)
567
611
. await ?;
568
612
let manager_fut = tokio:: spawn ( manager. _run_grpc ( ) ) ;
@@ -610,6 +654,7 @@ mod tests {
610
654
1 , // world size
611
655
Duration :: from_millis ( 100 ) , // heartbeat interval
612
656
Duration :: from_secs ( 10 ) , // connect timeout
657
+ 0 , // quorum retries
613
658
)
614
659
. await ?;
615
660
let manager_fut = tokio:: spawn ( manager. clone ( ) . run ( ) ) ;
@@ -671,6 +716,7 @@ mod tests {
671
716
1 , // world size
672
717
Duration :: from_millis ( 100 ) , // heartbeat interval
673
718
Duration :: from_secs ( 10 ) , // connect timeout
719
+ 0 , // quorum retries
674
720
)
675
721
. await ?;
676
722
let manager_fut = tokio:: spawn ( manager. clone ( ) . run ( ) ) ;
@@ -737,6 +783,7 @@ mod tests {
737
783
1 , // world size
738
784
Duration :: from_millis ( 100 ) , // heartbeat interval
739
785
Duration :: from_secs ( 10 ) , // connect timeout
786
+ 0 , // quorum retries
740
787
)
741
788
. await ?;
742
789
let manager_fut = tokio:: spawn ( manager. clone ( ) . run ( ) ) ;
0 commit comments