@@ -10,6 +10,7 @@ use std::collections::HashSet;
10
10
use std:: sync:: Arc ;
11
11
use std:: time:: Duration ;
12
12
13
+ use crate :: torchftpb:: LighthouseQuorumResponse ;
13
14
use anyhow:: Result ;
14
15
use tokio:: sync:: broadcast;
15
16
use tokio:: sync:: Mutex ;
@@ -60,6 +61,8 @@ struct ManagerState {
60
61
should_commit_channel : broadcast:: Sender < bool > ,
61
62
should_commit_failures : HashSet < i64 > ,
62
63
should_commit_count : HashSet < i64 > ,
64
+
65
+ lighthouse_client : LighthouseServiceClient < Channel > ,
63
66
}
64
67
65
68
pub struct Manager {
@@ -71,7 +74,9 @@ pub struct Manager {
71
74
listener : Mutex < Option < tokio:: net:: TcpListener > > ,
72
75
local_addr : SocketAddr ,
73
76
heartbeat_interval : Duration ,
74
- lighthouse_client : LighthouseServiceClient < Channel > ,
77
+ lighthouse_addr : String ,
78
+ connect_timeout : Duration ,
79
+ quorum_retries : i64 ,
75
80
}
76
81
77
82
pub async fn manager_client_new (
@@ -108,6 +113,7 @@ impl Manager {
108
113
world_size : u64 ,
109
114
heartbeat_interval : Duration ,
110
115
connect_timeout : Duration ,
116
+ quorum_retries : i64 ,
111
117
) -> Result < Arc < Self > > {
112
118
let listener = tokio:: net:: TcpListener :: bind ( & bind) . await ?;
113
119
let local_addr = listener. local_addr ( ) ?;
@@ -119,7 +125,8 @@ impl Manager {
119
125
120
126
Ok ( Arc :: new ( Self {
121
127
replica_id : replica_id,
122
- lighthouse_client : client,
128
+ lighthouse_addr,
129
+ connect_timeout,
123
130
hostname : hostname,
124
131
store_address : store_addr,
125
132
world_size : world_size,
@@ -132,9 +139,12 @@ impl Manager {
132
139
should_commit_channel : should_commit_tx,
133
140
should_commit_count : HashSet :: new ( ) ,
134
141
should_commit_failures : HashSet :: new ( ) ,
142
+
143
+ lighthouse_client : client,
135
144
} ) ,
136
145
local_addr : local_addr,
137
146
listener : Mutex :: new ( Some ( listener) ) ,
147
+ quorum_retries,
138
148
} ) )
139
149
}
140
150
@@ -170,8 +180,12 @@ impl Manager {
170
180
}
171
181
172
182
async fn _run_heartbeat ( self : Arc < Self > ) -> Result < ( ) > {
173
- let mut client = self . lighthouse_client . clone ( ) ;
174
183
loop {
184
+ let mut client = {
185
+ let state = self . state . lock ( ) . await ;
186
+ state. lighthouse_client . clone ( )
187
+ } ;
188
+
175
189
let request = tonic:: Request :: new ( LighthouseHeartbeatRequest {
176
190
replica_id : self . replica_id . clone ( ) ,
177
191
} ) ;
@@ -197,21 +211,13 @@ impl Manager {
197
211
198
212
// TODO: don't hold the lock during quorum
199
213
200
- let mut client = self . lighthouse_client . clone ( ) ;
201
-
202
- let mut lighthouse_request = tonic:: Request :: new ( LighthouseQuorumRequest {
214
+ let lighthouse_request = LighthouseQuorumRequest {
203
215
requester : Some ( requester) ,
204
- } ) ;
205
- lighthouse_request. set_timeout ( timeout) ;
216
+ } ;
206
217
207
- let response = tokio:: time:: timeout ( timeout, client. quorum ( lighthouse_request) )
208
- . await
209
- . unwrap_or_else ( |e| {
210
- Err ( Status :: cancelled ( format ! (
211
- "lighthouse quorum timed out: {}" ,
212
- e. to_string( )
213
- ) ) )
214
- } ) ?;
218
+ let response = self
219
+ . _quorum_with_retries ( state, timeout, lighthouse_request)
220
+ . await ?;
215
221
let resp = response. into_inner ( ) ;
216
222
217
223
info_with_replica ! ( self . replica_id, "got lighthouse quorum {:?}" , resp) ;
@@ -226,6 +232,66 @@ impl Manager {
226
232
227
233
Ok ( ( ) )
228
234
}
235
+
236
+ async fn _quorum_with_retries (
237
+ & self ,
238
+ state : & mut ManagerState ,
239
+ timeout : Duration ,
240
+ lighthouse_request : LighthouseQuorumRequest ,
241
+ ) -> Result < tonic:: Response < LighthouseQuorumResponse > , Status > {
242
+ let mut client = state. lighthouse_client . clone ( ) ;
243
+
244
+ let mut retry_count = 0 ;
245
+ loop {
246
+ let mut request = tonic:: Request :: new ( lighthouse_request. clone ( ) ) ;
247
+ request. set_timeout ( timeout) ;
248
+
249
+ let result = tokio:: time:: timeout ( timeout, client. quorum ( request) ) . await ;
250
+
251
+ match result {
252
+ Ok ( response) => {
253
+ return response;
254
+ }
255
+ Err ( e) => {
256
+ info_with_replica ! (
257
+ self . replica_id,
258
+ "lighthouse quorum failed. error: {}" ,
259
+ e. to_string( )
260
+ ) ;
261
+
262
+ if retry_count == self . quorum_retries {
263
+ return Err ( Status :: internal ( format ! (
264
+ "lighthouse quorum failed after {} retries. error: {}" ,
265
+ retry_count,
266
+ e. to_string( ) ,
267
+ ) ) ) ;
268
+ }
269
+
270
+ tokio:: time:: sleep ( tokio:: time:: Duration :: from_millis ( 100 ) ) . await ;
271
+
272
+ // Reset the client since the lighthouse server might have failed
273
+ // If this also fails, consider increasing `connect_timeout`.
274
+ let lighthouse_client =
275
+ lighthouse_client_new ( self . lighthouse_addr . clone ( ) , self . connect_timeout )
276
+ . await ;
277
+
278
+ match lighthouse_client {
279
+ Ok ( client) => {
280
+ state. lighthouse_client = client;
281
+ }
282
+ Err ( e) => {
283
+ return Err ( Status :: internal ( format ! (
284
+ "Failed to connect to lighthouse. error: {}" ,
285
+ e. to_string( ) ,
286
+ ) ) ) ;
287
+ }
288
+ }
289
+
290
+ retry_count += 1 ;
291
+ }
292
+ }
293
+ }
294
+ }
229
295
}
230
296
231
297
#[ tonic:: async_trait]
@@ -563,6 +629,7 @@ mod tests {
563
629
2 , // world size
564
630
Duration :: from_millis ( 100 ) , // heartbeat interval
565
631
Duration :: from_secs ( 10 ) , // connect timeout
632
+ 0 , // quorum retries
566
633
)
567
634
. await ?;
568
635
let manager_fut = tokio:: spawn ( manager. _run_grpc ( ) ) ;
@@ -610,6 +677,7 @@ mod tests {
610
677
1 , // world size
611
678
Duration :: from_millis ( 100 ) , // heartbeat interval
612
679
Duration :: from_secs ( 10 ) , // connect timeout
680
+ 0 , // quorum retries
613
681
)
614
682
. await ?;
615
683
let manager_fut = tokio:: spawn ( manager. clone ( ) . run ( ) ) ;
@@ -671,6 +739,7 @@ mod tests {
671
739
1 , // world size
672
740
Duration :: from_millis ( 100 ) , // heartbeat interval
673
741
Duration :: from_secs ( 10 ) , // connect timeout
742
+ 0 , // quorum retries
674
743
)
675
744
. await ?;
676
745
let manager_fut = tokio:: spawn ( manager. clone ( ) . run ( ) ) ;
@@ -737,6 +806,7 @@ mod tests {
737
806
1 , // world size
738
807
Duration :: from_millis ( 100 ) , // heartbeat interval
739
808
Duration :: from_secs ( 10 ) , // connect timeout
809
+ 0 , // quorum retries
740
810
)
741
811
. await ?;
742
812
let manager_fut = tokio:: spawn ( manager. clone ( ) . run ( ) ) ;
0 commit comments