Skip to content

Commit f00c858

Browse files
committed
kernel: force reprogram if kernel device has device global with init_mode reprogram and extend unit test
1 parent cd2164e commit f00c858

File tree

2 files changed

+201
-17
lines changed

2 files changed

+201
-17
lines changed

src/acl_kernel.cpp

Lines changed: 26 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -3041,23 +3041,33 @@ int acl_submit_kernel_device_op(cl_event event) {
30413041
// to free up old operation slots.
30423042
acl_forget_proposed_device_ops(doq);
30433043

3044+
// Force reprogram if there is a device global with init_mode:reprogram
3045+
// or if this is the split kernel mode (split kernel workaround)
30443046
bool need_reprogram = true;
3045-
if (device->last_bin) {
3046-
// compare hash of last program that went through device op queue and the
3047-
// program required by kernel
3048-
need_reprogram =
3049-
device->last_bin->get_devdef().autodiscovery_def.binary_rand_hash !=
3050-
dev_bin->get_devdef().autodiscovery_def.binary_rand_hash;
3051-
} else {
3052-
// compare hash of program that is on the device and the program required by
3053-
// kernel
3054-
need_reprogram = device->def.autodiscovery_def.binary_rand_hash !=
3055-
dev_bin->get_devdef().autodiscovery_def.binary_rand_hash;
3056-
}
3057-
3058-
if (event->context->split_kernel) {
3059-
// Always reprogram in split kernel mode. This is a temporary workaround.
3060-
need_reprogram = true;
3047+
// Otherwise, check whether a reprogram is needed based on the hash of the
// loaded binary.
3048+
// First, look for any device global whose init_mode is reprogram
3049+
std::unordered_map<std::string, acl_device_global_mem_def_t>
3050+
device_global_mem_defs =
3051+
device->def.autodiscovery_def.device_global_mem_defs;
3052+
const auto reprogram_it = std::find_if(
3053+
device_global_mem_defs.begin(), device_global_mem_defs.end(),
3054+
[](const auto &it) {
3055+
return it.second.init_mode == ACL_DEVICE_GLOBAL_INIT_MODE_REPROGRAM;
3056+
});
3057+
if (!event->context->split_kernel &&
3058+
(reprogram_it == device_global_mem_defs.end())) {
3059+
if (device->last_bin) {
3060+
// compare hash of last program that went through device op queue and the
3061+
// program required by kernel
3062+
need_reprogram =
3063+
device->last_bin->get_devdef().autodiscovery_def.binary_rand_hash !=
3064+
dev_bin->get_devdef().autodiscovery_def.binary_rand_hash;
3065+
} else {
3066+
// compare hash of program that is on the device and the program required
3067+
// by kernel
3068+
need_reprogram = device->def.autodiscovery_def.binary_rand_hash !=
3069+
dev_bin->get_devdef().autodiscovery_def.binary_rand_hash;
3070+
}
30613071
}
30623072

30633073
if (need_reprogram) {

test/acl_kernel_test.cpp

Lines changed: 175 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3968,7 +3968,7 @@ TEST(acl_kernel_reprogram_scheduler, switch_prog) {
39683968
// set MEM_MIGRATE2.1 to COMPLETE +
39693969
// set MEM_MIGRATE2.2 to RUNNING +
39703970
// set MEM_MIGRATE2.2 to COMPLETE +
3971-
// submit KERNEL2 to device = 5
3971+
// submit KERNEL2 to device = 10
39723972
CHECK_EQUAL(offset + 15, m_devlog.num_ops);
39733973

39743974
// Should have copied the memory over.
@@ -4332,6 +4332,180 @@ TEST(acl_kernel_reprogram_scheduler, switch_prog) {
43324332
CHECK_EQUAL(CL_SUCCESS, clReleaseCommandQueue(cq2));
43334333
}
43344334

4335+
TEST(acl_kernel_reprogram_scheduler, device_global_reprogram) {
4336+
// In this test, we will force the device to contain a reprogram
4337+
// device global. The device will first be reprogrammed eagerly
4338+
// due to the clCreateProgramWithBinary call, then when the
4339+
// kernel is enqueued, another reprogram should be scheduled
4340+
// even though the device is already programmed with the right
4341+
// binary, due to the presence of the device global.
4342+
4343+
// Force the device to contain a device global with init_mode reprogram
4344+
m_device->def.autodiscovery_def.device_global_mem_defs.insert(
4345+
{"dev_glob1",
4346+
{/* address */ 1024,
4347+
/* size */ 1024,
4348+
/* host_access */ ACL_DEVICE_GLOBAL_HOST_ACCESS_READ_WRITE,
4349+
/* init_mode */ ACL_DEVICE_GLOBAL_INIT_MODE_REPROGRAM,
4350+
/* implement_in_csr */ false}});
4351+
4352+
// Initial eager reprogram
4353+
int offset = m_devlog.num_ops;
4354+
CHECK_EQUAL(3, offset);
4355+
4356+
acl_device_program_info_t *dp0 = check_dev_prog(m_program0);
4357+
4358+
m_context->reprogram_buf_read_callback = read_mem_callback;
4359+
m_context->reprogram_buf_write_callback = write_mem_callback;
4360+
4361+
// A device side buffer
4362+
cl_int status = CL_INVALID_VALUE;
4363+
cl_mem mem = clCreateBuffer(m_context, CL_MEM_READ_WRITE, 2048, 0, &status);
4364+
CHECK_EQUAL(CL_SUCCESS, status);
4365+
CHECK(mem);
4366+
memset(mem->host_mem.aligned_ptr, 'X', mem->size);
4367+
memset(mem->block_allocation->range.begin, 'x', mem->size);
4368+
4369+
CHECK_EQUAL(1, m_context->device_buffers_have_backing_store);
4370+
CHECK_EQUAL(0, mem->block_allocation->region->is_host_accessible);
4371+
CHECK_EQUAL(0, mem->writable_copy_on_host);
4372+
4373+
cl_kernel k = get_kernel(m_program0);
4374+
4375+
// Just the initial program load.
4376+
CHECK_EQUAL(m_first_dev_bin, m_device->last_bin);
4377+
CHECK_EQUAL(m_first_dev_bin, m_device->loaded_bin);
4378+
4379+
cl_event ue = get_user_event();
4380+
cl_event k_e = 0;
4381+
4382+
CHECK_EQUAL(CL_SUCCESS, clSetKernelArg(k, 0, sizeof(cl_mem), &mem));
4383+
CHECK_EQUAL(CL_SUCCESS, clSetKernelArg(k, 1, sizeof(cl_mem), &mem));
4384+
CHECK_EQUAL(CL_SUCCESS, clEnqueueTask(m_cq, k, 1, &ue, &k_e));
4385+
CHECK_EQUAL(CL_COMMAND_TASK, k_e->cmd.type);
4386+
4387+
// Only initial programming has occurred.
4388+
// Has 3 transitions logged: SUBMITTED, RUNNING, COMPLETE
4389+
CHECK_EQUAL(m_first_dev_bin, m_device->last_bin);
4390+
CHECK_EQUAL(m_first_dev_bin, m_device->loaded_bin);
4391+
4392+
acl_print_debug_msg("Forcing user event completion\n");
4393+
CHECK_EQUAL(CL_SUCCESS, clSetUserEventStatus(ue, CL_COMPLETE));
4394+
CHECK_EQUAL(CL_SUCCESS, clReleaseEvent(ue));
4395+
4396+
// Should have recorded that we loaded the program.
4397+
CHECK_EQUAL(&(dp0->device_binary), m_device->last_bin);
4398+
CHECK_EQUAL(&(dp0->device_binary), m_device->loaded_bin);
4399+
4400+
// submit device global forced REPROGRAM +
4401+
// set REPROGRAM to RUNNING +
4402+
// set REPROGRAM to COMPLETE +
4403+
// set MEM_MIGRATE 1 to RUNNING +
4404+
// set MEM_MIGRATE 1 to COMPLETE +
4405+
// set MEM_MIGRATE 2 to RUNNING +
4406+
// set MEM_MIGRATE 2 to COMPLETE +
4407+
// submit KERNEL = 8
4408+
CHECK_EQUAL(offset + 8, m_devlog.num_ops);
4409+
const acl_device_op_t *op0submit = &(m_devlog.before[3]);
4410+
const acl_device_op_t *op0running = &(m_devlog.before[4]);
4411+
const acl_device_op_t *op0complete = &(m_devlog.before[5]);
4412+
4413+
// Device global forced reprogram
4414+
CHECK_EQUAL(ACL_DEVICE_OP_REPROGRAM, op0submit->info.type);
4415+
CHECK_EQUAL(0, op0submit->id);
4416+
CHECK(op0submit->info.event);
4417+
CHECK_EQUAL(CL_SUBMITTED, op0submit->status);
4418+
CHECK_EQUAL(0, op0submit->info.num_printf_bytes_pending);
4419+
CHECK_EQUAL(1, op0submit->first_in_group);
4420+
CHECK_EQUAL(0, op0submit->last_in_group);
4421+
4422+
CHECK_EQUAL(ACL_DEVICE_OP_REPROGRAM, op0running->info.type);
4423+
CHECK_EQUAL(0, op0running->id);
4424+
CHECK(op0running->info.event);
4425+
CHECK_EQUAL(CL_RUNNING, op0running->status);
4426+
CHECK_EQUAL(0, op0running->info.num_printf_bytes_pending);
4427+
CHECK_EQUAL(1, op0running->first_in_group);
4428+
CHECK_EQUAL(0, op0running->last_in_group);
4429+
4430+
CHECK_EQUAL(ACL_DEVICE_OP_REPROGRAM, op0complete->info.type);
4431+
CHECK_EQUAL(0, op0complete->id);
4432+
CHECK(op0complete->info.event);
4433+
CHECK_EQUAL(CL_COMPLETE, op0complete->status);
4434+
CHECK_EQUAL(0, op0complete->info.num_printf_bytes_pending);
4435+
CHECK_EQUAL(1, op0complete->first_in_group);
4436+
CHECK_EQUAL(0, op0complete->last_in_group);
4437+
4438+
// The device is still programmed with the same program.
4439+
CHECK_EQUAL(&(dp0->device_binary), m_device->last_bin);
4440+
CHECK_EQUAL(&(dp0->device_binary), m_device->loaded_bin);
4441+
4442+
const acl_device_op_t *op1submit = &(m_devlog.before[10]);
4443+
CHECK_EQUAL(ACL_DEVICE_OP_KERNEL, op1submit->info.type);
4444+
CHECK_EQUAL(k_e, op1submit->info.event);
4445+
CHECK_EQUAL(CL_SUBMITTED, op1submit->status);
4446+
CHECK_EQUAL(0, op1submit->info.num_printf_bytes_pending);
4447+
CHECK_EQUAL(0, op1submit->first_in_group); // reprogram is first
4448+
CHECK_EQUAL(1, op1submit->last_in_group);
4449+
4450+
// The user-level event is linked to the kernel device op now.
4451+
CHECK_EQUAL(op1submit->id, k_e->current_device_op->id);
4452+
4453+
// Pretend to start the kernel
4454+
acl_print_debug_msg("Say kernel is running\n");
4455+
ACL_LOCKED(acl_receive_kernel_update(k_e->current_device_op->id, CL_RUNNING));
4456+
CHECK_EQUAL(CL_RUNNING, k_e->current_device_op->execution_status);
4457+
4458+
ACL_LOCKED(acl_idle_update(m_context));
4459+
4460+
// Now we have a "running" transition
4461+
CHECK_EQUAL(offset + 9, m_devlog.num_ops);
4462+
const acl_device_op_t *op2a = &(m_devlog.after[11]);
4463+
CHECK_EQUAL(ACL_DEVICE_OP_KERNEL, op2a->info.type);
4464+
CHECK_EQUAL(k_e, op2a->info.event);
4465+
CHECK_EQUAL(CL_RUNNING, op2a->status);
4466+
CHECK_EQUAL(0, op2a->info.num_printf_bytes_pending);
4467+
CHECK_EQUAL(0, op2a->first_in_group);
4468+
CHECK_EQUAL(1, op2a->last_in_group);
4469+
4470+
// The running status was propagated up to the user-level event.
4471+
CHECK_EQUAL(CL_RUNNING, k_e->execution_status);
4472+
4473+
acl_print_debug_msg("Say kernel is complete\n");
4474+
ACL_LOCKED(
4475+
acl_receive_kernel_update(k_e->current_device_op->id, CL_COMPLETE));
4476+
CHECK_EQUAL(CL_COMPLETE, k_e->current_device_op->execution_status);
4477+
4478+
ACL_LOCKED(acl_idle_update(m_context));
4479+
// Now we have a "complete" transition
4480+
CHECK_EQUAL(offset + 10, m_devlog.num_ops);
4481+
const acl_device_op_t *op3a = &(m_devlog.after[12]);
4482+
CHECK_EQUAL(ACL_DEVICE_OP_KERNEL, op3a->info.type);
4483+
CHECK_EQUAL(k_e, op3a->info.event);
4484+
CHECK_EQUAL(CL_COMPLETE, op3a->status);
4485+
CHECK_EQUAL(0, op3a->info.num_printf_bytes_pending);
4486+
CHECK_EQUAL(0, op3a->first_in_group);
4487+
CHECK_EQUAL(1, op3a->last_in_group);
4488+
4489+
// Completion timestamp has propagated up to the user level event.
4490+
CHECK_EQUAL(acl_platform.device_op_queue.op[op3a->id].timestamp[CL_COMPLETE],
4491+
k_e->timestamp[CL_COMPLETE]);
4492+
4493+
// Completion wipes out the downlink.
4494+
CHECK_EQUAL(0, k_e->current_device_op);
4495+
4496+
// And let go.
4497+
// (Don't check for CL_INVALID_EVENT on a second release of each of
4498+
// these events because the events might be reused.)
4499+
CHECK_EQUAL(CL_SUCCESS, clReleaseMemObject(mem));
4500+
CHECK_EQUAL(CL_SUCCESS, clReleaseEvent(k_e));
4501+
CHECK_EQUAL(CL_SUCCESS, clReleaseKernel(k));
4502+
4503+
// Clean up device global
4504+
m_device->def.autodiscovery_def.device_global_mem_defs.clear();
4505+
4506+
acl_print_debug_msg("DONE!\n");
4507+
}
4508+
43354509
TEST(acl_kernel_reprogram_scheduler, use_host_buf_as_arg) {
43364510
// Must be able to use a host-side buffer as a kernel argument.
43374511
cl_int status = 0;

0 commit comments

Comments
 (0)