Skip to content

Commit 568659a

Browse files
committed
Resolves hang when the device op queue gets full and no more commands can be submitted
1 parent aede25e commit 568659a

File tree

1 file changed

+30
-17
lines changed

1 file changed

+30
-17
lines changed

src/acl_command_queue.cpp

Lines changed: 30 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -677,11 +677,13 @@ void acl_try_FastKernelRelaunch_ooo_queue_event_dependents(cl_event parent) {
677677

678678
// Fast Kernel Relaunch: submitting is safe even though has dependency
679679
// Prior to submitting remove dependency
680-
dependent->depend_on.erase(parent);
681-
dependent_it = parent->depend_on_me.erase(dependent_it);
682-
dependent_it--; // decrement it other wise we will skip an element
683-
dependent->command_queue->num_commands_submitted++;
684-
acl_submit_command(dependent);
680+
int local_updates = acl_submit_command(dependent);
681+
if (local_updates) {
682+
dependent->depend_on.erase(parent);
683+
dependent_it = parent->depend_on_me.erase(dependent_it);
684+
dependent_it--; // decrement it otherwise we will skip an element
685+
dependent->command_queue->num_commands_submitted++;
686+
}
685687
}
686688
}
687689

@@ -691,19 +693,26 @@ int acl_update_ooo_queue(cl_command_queue command_queue) {
691693
// Directly submit the event if it has no dependencies
692694
// unless it is a user_event queue which never submits events
693695
while (!command_queue->new_commands.empty()) {
696+
int success = 1;
694697
cl_event event = command_queue->new_commands.front();
695698
if (command_queue->submits_commands &&
696699
event->execution_status == CL_QUEUED) {
697700
if (event->depend_on.empty()) {
698701
command_queue->num_commands_submitted++;
699-
acl_submit_command(event);
702+
success = acl_submit_command(event);
700703
} else {
704+
// This is allowed to fail, so no need to mark success as false
705+
// dependent events that fail Fast Kernel Relaunch will still be picked up when
706+
// their parent event finishes
701707
acl_try_FastKernelRelaunch_ooo_queue_event_dependents(
702708
*(event->depend_on.begin()));
703709
}
704710
}
705-
// safe to pop as there is a master copy in command_queue->commands
706-
command_queue->new_commands.pop_front();
711+
712+
if (success) {
713+
// safe to pop as there is a master copy in command_queue->commands
714+
command_queue->new_commands.pop_front();
715+
}
707716
}
708717

709718
// Remove dependencies on completed events, and launch any events
@@ -731,9 +740,10 @@ int acl_update_ooo_queue(cl_command_queue command_queue) {
731740
if ((dependent->command_queue->properties &
732741
CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE) &&
733742
dependent->cmd.type != CL_COMMAND_USER) {
734-
dependent->command_queue
735-
->num_commands_submitted++; // dependent might be on another queue
736-
num_updates += acl_submit_command(dependent);
743+
int local_updates = acl_submit_command(dependent);
744+
dependent->command_queue->num_commands_submitted +=
745+
local_updates; // dependent might be on another queue
746+
num_updates += local_updates;
737747
}
738748
}
739749
}
@@ -879,8 +889,9 @@ int acl_update_inorder_queue(cl_command_queue command_queue) {
879889
}
880890

881891
if (command_queue->num_commands_submitted == 0) {
882-
command_queue->num_commands_submitted++;
883-
num_updates += acl_submit_command(event);
892+
int local_updates = acl_submit_command(event);
893+
command_queue->num_commands_submitted += local_updates;
894+
num_updates += local_updates;
884895
continue; // there might be another kernel behind us that can be
885896
// submitted as well
886897
} else {
@@ -900,8 +911,9 @@ int acl_update_inorder_queue(cl_command_queue command_queue) {
900911
if (submitted_event->last_device_op->status <= CL_SUBMITTED) {
901912
// Assumption: last device_op of the submitted kernel event is a
902913
// kernel_op
903-
command_queue->num_commands_submitted++;
904-
num_updates += acl_submit_command(event);
914+
int local_updates = acl_submit_command(event);
915+
command_queue->num_commands_submitted += local_updates;
916+
num_updates += local_updates;
905917
continue; // there might be another kernel behind us that can be
906918
// submitted as well
907919
}
@@ -915,8 +927,9 @@ int acl_update_inorder_queue(cl_command_queue command_queue) {
915927
event->depend_on.empty()) {
916928
// it is safe to submit: nothing else submitted AND all dependencies
917929
// are resolved
918-
command_queue->num_commands_submitted++;
919-
num_updates += acl_submit_command(event);
930+
int local_updates = acl_submit_command(event);
931+
command_queue->num_commands_submitted += local_updates;
932+
num_updates += local_updates;
920933
}
921934
break; // no more events can be submitted
922935
}

0 commit comments

Comments
 (0)