Skip to content

Commit 29bf7b7

Browse files
committed
Delay erasing context from contexts_set during acl_idle_update
Fixes issue where context is being deleted from contexts_set while traversing through the same context_set, causing undefined behaviour and sometimes crash in acl_idle_update.
1 parent 8fb3fc7 commit 29bf7b7

File tree

2 files changed

+83
-51
lines changed

2 files changed

+83
-51
lines changed

include/acl_types.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1027,6 +1027,10 @@ typedef struct _cl_context {
10271027
// Fix re-entrancy of clReleaseContext.
10281028
int is_being_freed;
10291029

1030+
// Is this context in the middle of an idle update?
1031+
// Fix erase while traverse of contexts_set
1032+
int is_being_updated;
1033+
10301034
////////////////////////////
10311035
// Behaviour switches dependent on compiler mode
10321036
//

src/acl_context.cpp

Lines changed: 79 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@ static cl_device_id l_find_device_by_name(const std::string &name);
5252
static cl_int l_update_program_library_root(cl_context context,
5353
const char *new_root);
5454
static cl_int l_update_compile_command(cl_context context, const char *new_cmd);
55+
static std::set<cl_context>::iterator l_release_context(cl_context context);
5556

5657
ACL_DEFINE_CL_OBJECT_ALLOC_FUNCTIONS(cl_context);
5758

@@ -243,6 +244,8 @@ CL_API_ENTRY cl_context CL_API_CALL clCreateContextFromTypeIntelFPGA(
243244
if (errcode_ret) {
244245
*errcode_ret = CL_SUCCESS;
245246
}
247+
// the context is created successfully, add it to the set
248+
acl_platform.contexts_set.insert(context);
246249
return context;
247250
}
248251

@@ -275,6 +278,63 @@ CL_API_ENTRY cl_int CL_API_CALL clRetainContext(cl_context context) {
275278
return clRetainContextIntelFPGA(context);
276279
}
277280

281+
static std::set<cl_context>::iterator l_release_context(cl_context context) {
282+
// Make sure we clean up the elf files. This should already be done by
283+
// clReleaseProgram.
284+
for (unsigned i = 0; i < context->num_devices; i++) {
285+
if (context->device[i]->loaded_bin != nullptr)
286+
context->device[i]->loaded_bin->unload_content();
287+
if (context->device[i]->last_bin != nullptr)
288+
context->device[i]->last_bin->unload_content();
289+
}
290+
291+
// We have to close all devices associated with this context so they can be
292+
// opened by other processes
293+
acl_get_hal()->close_devices(context->num_devices, context->device);
294+
295+
// remove the context from the context set in the platform
296+
auto cur_it = acl_platform.contexts_set.find(context);
297+
assert(cur_it != acl_platform.contexts_set.end() &&
298+
"Context to be erased is not in the contexts set!");
299+
std::set<cl_context>::iterator it = acl_platform.contexts_set.erase(cur_it);
300+
301+
context->notify_fn = 0;
302+
context->notify_user_data = 0;
303+
304+
if (context->auto_queue) {
305+
// Really subtle.
306+
// The command queue release needs to see the context as being valid.
307+
// That's why we had to defer the acl_release call.
308+
clReleaseCommandQueue(context->auto_queue);
309+
context->auto_queue = 0; // for clarity
310+
}
311+
312+
if (context->user_event_queue) {
313+
clReleaseCommandQueue(context->user_event_queue);
314+
context->user_event_queue = 0; // for clarity
315+
}
316+
317+
if (context->command_queue) {
318+
acl_free(context->command_queue);
319+
}
320+
321+
clReleaseMemObject(context->unwrapped_host_mem);
322+
323+
l_forcibly_release_allocations(context);
324+
325+
acl_untrack_object(context);
326+
327+
// all that should be left now is the single implicit retain from when
328+
// the cl_context was created
329+
assert(acl_ref_count(context) == 1);
330+
acl_free_cl_context(context);
331+
332+
// Close the profiler output file after the last context release
333+
acl_close_profiler_file();
334+
335+
return it;
336+
}
337+
278338
ACL_EXPORT
279339
CL_API_ENTRY cl_int CL_API_CALL clReleaseContextIntelFPGA(cl_context context) {
280340
std::scoped_lock lock{acl_mutex_wrapper};
@@ -290,6 +350,11 @@ CL_API_ENTRY cl_int CL_API_CALL clReleaseContextIntelFPGA(cl_context context) {
290350

291351
if (acl_ref_count(context) > context->num_automatic_references) {
292352
acl_release(context);
353+
} else if (context->is_being_updated) {
354+
assert(context->is_being_freed == 0 &&
355+
"Context is being released more than retained count!");
356+
context->is_being_freed = 1; // Defer freeing context;
357+
return CL_SUCCESS;
293358
} else {
294359
// num_automatic_references is the ref count the context had when it was
295360
// given to the user. So, if we have a ref count equal to that, the user
@@ -307,55 +372,7 @@ CL_API_ENTRY cl_int CL_API_CALL clReleaseContextIntelFPGA(cl_context context) {
307372
}
308373
context->is_being_freed = 1;
309374

310-
// Make sure we clean up the elf files. This should already be done by
311-
// clReleaseProgram.
312-
for (unsigned i = 0; i < context->num_devices; i++) {
313-
if (context->device[i]->loaded_bin != nullptr)
314-
context->device[i]->loaded_bin->unload_content();
315-
if (context->device[i]->last_bin != nullptr)
316-
context->device[i]->last_bin->unload_content();
317-
}
318-
319-
// We have to close all devices associated with this context so they can be
320-
// opened by other processes
321-
acl_get_hal()->close_devices(context->num_devices, context->device);
322-
323-
// remove the context from the context set in the platform
324-
acl_platform.contexts_set.erase(context);
325-
326-
context->notify_fn = 0;
327-
context->notify_user_data = 0;
328-
329-
if (context->auto_queue) {
330-
// Really subtle.
331-
// The command queue release needs to see the context as being valid.
332-
// That's why we had to defer the acl_release call.
333-
clReleaseCommandQueue(context->auto_queue);
334-
context->auto_queue = 0; // for clarity
335-
}
336-
337-
if (context->user_event_queue) {
338-
clReleaseCommandQueue(context->user_event_queue);
339-
context->user_event_queue = 0; // for clarity
340-
}
341-
342-
if (context->command_queue) {
343-
acl_free(context->command_queue);
344-
}
345-
346-
clReleaseMemObject(context->unwrapped_host_mem);
347-
348-
l_forcibly_release_allocations(context);
349-
350-
acl_untrack_object(context);
351-
352-
// all that should be left now is the single implicit retain from when
353-
// the cl_context was created
354-
assert(acl_ref_count(context) == 1);
355-
acl_free_cl_context(context);
356-
357-
// Close the profiler output file after the last context release
358-
acl_close_profiler_file();
375+
l_release_context(context);
359376
}
360377

361378
return CL_SUCCESS;
@@ -561,6 +578,7 @@ static cl_int l_load_properties(cl_context context,
561578
CL_CONTEXT_COMPILER_MODE_OFFLINE_INTELFPGA);
562579
context->split_kernel = 0;
563580
context->is_being_freed = 0;
581+
context->is_being_updated = 0;
564582
context->eagerly_program_device_with_first_binary = 1;
565583

566584
// This one is only overridden with an environment variable.
@@ -1203,9 +1221,19 @@ void acl_idle_update(cl_context context) {
12031221
// firstly update the current context
12041222
acl_update_context(context);
12051223
// update the other contexts from the platform
1206-
for (const cl_context _context : acl_platform.contexts_set) {
1207-
if (context != _context)
1224+
for (auto it = acl_platform.contexts_set.begin();
1225+
it != acl_platform.contexts_set.end();) {
1226+
auto _context = *it;
1227+
if (context != _context) {
1228+
_context->is_being_updated = 1;
12081229
acl_update_context(_context);
1230+
_context->is_being_updated = 0;
1231+
if (_context->is_being_freed) {
1232+
it = l_release_context(_context);
1233+
continue;
1234+
}
1235+
}
1236+
++it;
12091237
}
12101238
// if there are any new updates on the current contect done by updating the
12111239
// other it will be handled in clflush/clfinish or acl_wait_for_event where

0 commit comments

Comments
 (0)