Skip to content

Commit e527183

Browse files
committed
Delay erasing context from contexts_set during acl_idle_update
Fixes issue where context is being deleted from contexts_set while traversing through the same context_set, causing undefined behaviour and sometimes crash in acl_idle_update.
1 parent ab09456 commit e527183

File tree

3 files changed

+81
-51
lines changed

3 files changed

+81
-51
lines changed

include/acl_types.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1727,6 +1727,8 @@ typedef struct _cl_platform_id
17271727

17281728
// A set of contexts within this platform
17291729
std::set<cl_context> contexts_set;
1730+
// The context that is currently being updated
1731+
cl_context update_context;
17301732

17311733
} _cl_platform_id;
17321734

src/acl_context.cpp

Lines changed: 77 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@ static cl_device_id l_find_device_by_name(const std::string &name);
5252
static cl_int l_update_program_library_root(cl_context context,
5353
const char *new_root);
5454
static cl_int l_update_compile_command(cl_context context, const char *new_cmd);
55+
static std::set<cl_context>::iterator l_release_context(cl_context context);
5556

5657
ACL_DEFINE_CL_OBJECT_ALLOC_FUNCTIONS(cl_context);
5758

@@ -275,8 +276,66 @@ CL_API_ENTRY cl_int CL_API_CALL clRetainContext(cl_context context) {
275276
return clRetainContextIntelFPGA(context);
276277
}
277278

279+
static std::set<cl_context>::iterator l_release_context(cl_context context) {
280+
// Make sure we clean up the elf files. This should already be done by
281+
// clReleaseProgram.
282+
for (unsigned i = 0; i < context->num_devices; i++) {
283+
if (context->device[i]->loaded_bin != nullptr)
284+
context->device[i]->loaded_bin->unload_content();
285+
if (context->device[i]->last_bin != nullptr)
286+
context->device[i]->last_bin->unload_content();
287+
}
288+
289+
// We have to close all devices associated with this context so they can be
290+
// opened by other processes
291+
acl_get_hal()->close_devices(context->num_devices, context->device);
292+
293+
// remove the context from the context set in the platform
294+
auto cur_it = acl_platform.contexts_set.find(context);
295+
assert(cur_it != acl_platform.contexts_set.end() &&
296+
"Context to be erased is not in the contexts set!");
297+
std::set<cl_context>::iterator it = acl_platform.contexts_set.erase(cur_it);
298+
299+
context->notify_fn = 0;
300+
context->notify_user_data = 0;
301+
302+
if (context->auto_queue) {
303+
// Really subtle.
304+
// The command queue release needs to see the context as being valid.
305+
// That's why we had to defer the acl_release call.
306+
clReleaseCommandQueue(context->auto_queue);
307+
context->auto_queue = 0; // for clarity
308+
}
309+
310+
if (context->user_event_queue) {
311+
clReleaseCommandQueue(context->user_event_queue);
312+
context->user_event_queue = 0; // for clarity
313+
}
314+
315+
if (context->command_queue) {
316+
acl_free(context->command_queue);
317+
}
318+
319+
clReleaseMemObject(context->unwrapped_host_mem);
320+
321+
l_forcibly_release_allocations(context);
322+
323+
acl_untrack_object(context);
324+
325+
// all that should be left now is the single implicit retain from when
326+
// the cl_context was created
327+
assert(acl_ref_count(context) == 1);
328+
acl_free_cl_context(context);
329+
330+
// Close the profiler output file after the last context release
331+
acl_close_profiler_file();
332+
333+
return it;
334+
}
335+
278336
ACL_EXPORT
279337
CL_API_ENTRY cl_int CL_API_CALL clReleaseContextIntelFPGA(cl_context context) {
338+
// printf("clReleaseContextIntelFPGA %p\n", context);
280339
std::scoped_lock lock{acl_mutex_wrapper};
281340

282341
// Error out if the reference count is already 0
@@ -290,6 +349,11 @@ CL_API_ENTRY cl_int CL_API_CALL clReleaseContextIntelFPGA(cl_context context) {
290349

291350
if (acl_ref_count(context) > context->num_automatic_references) {
292351
acl_release(context);
352+
} else if (acl_platform.update_context == context) {
353+
assert(context->is_being_freed == 0 &&
354+
"Context is being released more than retained count!");
355+
context->is_being_freed = 1; // Defer freeing context;
356+
return CL_SUCCESS;
293357
} else {
294358
// num_automatic_references is the ref count the context had when it was
295359
// given to the user. So, if we have a ref count equal to that, the user
@@ -307,55 +371,7 @@ CL_API_ENTRY cl_int CL_API_CALL clReleaseContextIntelFPGA(cl_context context) {
307371
}
308372
context->is_being_freed = 1;
309373

310-
// Make sure we clean up the elf files. This should already be done by
311-
// clReleaseProgram.
312-
for (unsigned i = 0; i < context->num_devices; i++) {
313-
if (context->device[i]->loaded_bin != nullptr)
314-
context->device[i]->loaded_bin->unload_content();
315-
if (context->device[i]->last_bin != nullptr)
316-
context->device[i]->last_bin->unload_content();
317-
}
318-
319-
// We have to close all devices associated with this context so they can be
320-
// opened by other processes
321-
acl_get_hal()->close_devices(context->num_devices, context->device);
322-
323-
// remove the context from the context set in the platform
324-
acl_platform.contexts_set.erase(context);
325-
326-
context->notify_fn = 0;
327-
context->notify_user_data = 0;
328-
329-
if (context->auto_queue) {
330-
// Really subtle.
331-
// The command queue release needs to see the context as being valid.
332-
// That's why we had to defer the acl_release call.
333-
clReleaseCommandQueue(context->auto_queue);
334-
context->auto_queue = 0; // for clarity
335-
}
336-
337-
if (context->user_event_queue) {
338-
clReleaseCommandQueue(context->user_event_queue);
339-
context->user_event_queue = 0; // for clarity
340-
}
341-
342-
if (context->command_queue) {
343-
acl_free(context->command_queue);
344-
}
345-
346-
clReleaseMemObject(context->unwrapped_host_mem);
347-
348-
l_forcibly_release_allocations(context);
349-
350-
acl_untrack_object(context);
351-
352-
// all that should be left now is the single implicit retain from when
353-
// the cl_context was created
354-
assert(acl_ref_count(context) == 1);
355-
acl_free_cl_context(context);
356-
357-
// Close the profiler output file after the last context release
358-
acl_close_profiler_file();
374+
l_release_context(context);
359375
}
360376

361377
return CL_SUCCESS;
@@ -1203,9 +1219,19 @@ void acl_idle_update(cl_context context) {
12031219
// firstly update the current context
12041220
acl_update_context(context);
12051221
// update the other contexts from the platform
1206-
for (const cl_context _context : acl_platform.contexts_set) {
1207-
if (context != _context)
1222+
for (auto it = acl_platform.contexts_set.begin();
1223+
it != acl_platform.contexts_set.end();) {
1224+
auto _context = *it;
1225+
if (context != _context) {
1226+
acl_platform.update_context = _context;
12081227
acl_update_context(_context);
1228+
acl_platform.update_context = NULL;
1229+
if (_context->is_being_freed) {
1230+
it = l_release_context(_context);
1231+
continue;
1232+
}
1233+
}
1234+
++it;
12091235
}
12101236
// if there are any new updates on the current contect done by updating the
12111237
// other it will be handled in clflush/clfinish or acl_wait_for_event where

src/acl_platform.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -461,6 +461,8 @@ void acl_init_platform(void) {
461461
}
462462
}
463463

464+
acl_platform.update_context = NULL;
465+
464466
acl_platform.min_data_type_align_size = ACL_MEM_ALIGN;
465467

466468
acl_platform.single_fp_config =

0 commit comments

Comments
 (0)