diff --git a/include/acl.h b/include/acl.h
index ee98f9a5..45bcea7b 100644
--- a/include/acl.h
+++ b/include/acl.h
@@ -498,10 +498,39 @@ typedef class acl_device_program_info_t *acl_device_program_info;
  */
 #define ACL_MEM_CAPABILITY_P2P (1 << 3)
 
+// Enum values here need to match the SPIR-V spec for device globals in
+// https://github.com/intel/llvm/blob/44c6437684d64aba82d5a3de0e4bbe21d2b1f7ce/sycl/doc/design/spirv-extensions/SPV_INTEL_global_variable_decorations.asciidoc
+// ACL_DEVICE_GLOBAL_HOST_ACCESS_TYPE_COUNT is used for validation
+// in autodiscovery string parsing and should remain the last constant
+// in the enum.
+typedef enum {
+  ACL_DEVICE_GLOBAL_HOST_ACCESS_READ_ONLY,
+  ACL_DEVICE_GLOBAL_HOST_ACCESS_WRITE_ONLY,
+  ACL_DEVICE_GLOBAL_HOST_ACCESS_READ_WRITE,
+  ACL_DEVICE_GLOBAL_HOST_ACCESS_NONE,
+
+  ACL_DEVICE_GLOBAL_HOST_ACCESS_TYPE_COUNT
+} acl_device_global_host_access_t;
+
+// Enum values here also need to match the SPIR-V spec for device globals;
+// see the link above for acl_device_global_host_access_t.
+// ACL_DEVICE_GLOBAL_INIT_MODE_TYPE_COUNT is used for validation in
+// autodiscovery string parsing and should remain the last constant
+// in the enum.
+typedef enum {
+  ACL_DEVICE_GLOBAL_INIT_MODE_REPROGRAM,
+  ACL_DEVICE_GLOBAL_INIT_MODE_RESET,
+
+  ACL_DEVICE_GLOBAL_INIT_MODE_TYPE_COUNT
+} acl_device_global_init_mode_t;
+
 // Definition of device global.
 struct acl_device_global_mem_def_t {
-  uint32_t address;
+  uint64_t address;
   uint32_t size;
+  acl_device_global_host_access_t host_access;
+  acl_device_global_init_mode_t init_mode;
+  bool implement_in_csr;
 };
 
 // Part of acl_device_def_t where members are populated from the information
diff --git a/include/acl_kernel.h b/include/acl_kernel.h
index 1a6f88ef..4b6d9979 100644
--- a/include/acl_kernel.h
+++ b/include/acl_kernel.h
@@ -54,6 +54,13 @@ void acl_receive_kernel_update(int activation_id, cl_int status);
 // safe to submit a kernel with subbuffers to the device_op_queue
 int acl_kernel_has_unmapped_subbuffers(acl_mem_migrate_t *mem_migration);
 
+// Checks if the program currently loaded on the passed-in device contains
+// any device globals with reprogram init mode. When a kernel is submitted
+// for the first time and this function returns true, a force reprogram will
+// be scheduled even when the kernel binary hash matches the hash of the
+// currently loaded program.
+bool acl_device_has_reprogram_device_globals(cl_device_id device);
+
 #if defined(__cplusplus)
 } /* extern "C" */
 #endif
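
Aside (not part of the patch): a minimal, self-contained sketch of how the new acl_device_global_mem_def_t fields and the acl_device_has_reprogram_device_globals() check declared above fit together. The map type and helper below are illustrative stand-ins for the runtime's real autodiscovery data structures, not its actual interface.

    #include <cassert>
    #include <cstdint>
    #include <map>
    #include <string>

    // Illustrative copies of the enums/struct added to include/acl.h.
    enum acl_device_global_host_access_t {
      ACL_DEVICE_GLOBAL_HOST_ACCESS_READ_ONLY,
      ACL_DEVICE_GLOBAL_HOST_ACCESS_WRITE_ONLY,
      ACL_DEVICE_GLOBAL_HOST_ACCESS_READ_WRITE,
      ACL_DEVICE_GLOBAL_HOST_ACCESS_NONE,
      ACL_DEVICE_GLOBAL_HOST_ACCESS_TYPE_COUNT
    };
    enum acl_device_global_init_mode_t {
      ACL_DEVICE_GLOBAL_INIT_MODE_REPROGRAM,
      ACL_DEVICE_GLOBAL_INIT_MODE_RESET,
      ACL_DEVICE_GLOBAL_INIT_MODE_TYPE_COUNT
    };
    struct acl_device_global_mem_def_t {
      uint64_t address;
      uint32_t size;
      acl_device_global_host_access_t host_access;
      acl_device_global_init_mode_t init_mode;
      bool implement_in_csr;
    };

    // Hypothetical stand-in for the per-device map kept in the autodiscovery
    // definition: returns true when any device global requires a reprogram to
    // be (re)initialized, mirroring acl_device_has_reprogram_device_globals().
    static bool has_reprogram_device_globals(
        const std::map<std::string, acl_device_global_mem_def_t> &defs) {
      for (const auto &name_and_def : defs)
        if (name_and_def.second.init_mode == ACL_DEVICE_GLOBAL_INIT_MODE_REPROGRAM)
          return true;
      return false;
    }

    int main() {
      std::map<std::string, acl_device_global_mem_def_t> defs;
      defs["dg_reset"] = {0x1000, 64, ACL_DEVICE_GLOBAL_HOST_ACCESS_READ_WRITE,
                          ACL_DEVICE_GLOBAL_INIT_MODE_RESET, false};
      assert(!has_reprogram_device_globals(defs));
      defs["dg_reprog"] = {0x2000, 64, ACL_DEVICE_GLOBAL_HOST_ACCESS_NONE,
                           ACL_DEVICE_GLOBAL_INIT_MODE_REPROGRAM, false};
      assert(has_reprogram_device_globals(defs));
      return 0;
    }

The runtime's implementation (added to src/acl_kernel.cpp later in this diff) walks device->def.autodiscovery_def.device_global_mem_defs the same way, via std::find_if.
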
diff --git a/src/acl_auto_configure.cpp b/src/acl_auto_configure.cpp
index 323cba8e..557e0d5c 100644
--- a/src/acl_auto_configure.cpp
+++ b/src/acl_auto_configure.cpp
@@ -137,6 +137,28 @@ static bool read_uint32_counters(const std::string &str,
   return true;
 }
 
+// Reads the next word in str and converts it into an unsigned 64-bit
+// fixed-length integer. Note this read utilizes stoull and will fail if
+// unsigned long long is not 64 bits wide on the platform.
+// Returns true if a valid integer was read or false if an error occurred.
+// pos is updated to the position immediately following the parsed word
+// even if an error occurs.
+static bool read_uint64_counters(const std::string &str,
+                                 std::string::size_type &pos, uint64_t &val,
+                                 std::vector<int> &counters) noexcept {
+  std::string result;
+  pos = read_word(str, pos, result);
+  decrement_section_counters(counters);
+  try {
+    static_assert(sizeof(uint64_t) == sizeof(unsigned long long));
+    val = static_cast<uint64_t>(std::stoull(result));
+  } catch (const std::exception &e) {
+    UNREFERENCED_PARAMETER(e);
+    return false;
+  }
+  return true;
+}
+
 // Reads the next word in str and converts it into an unsigned.
 // Returns true if a valid integer was read or false if an error occurred.
 // pos is updated to the position immediately following the parsed word
@@ -470,6 +492,9 @@ static bool read_device_global_mem_defs(
                                    total_fields_device_global, counters);
   }
 
+  // Clean up any residual information first
+  device_global_mem_defs.clear();
+
   for (auto i = 0U; result && (i < num_device_global); i++) {
     counters.emplace_back(total_fields_device_global);
 
@@ -481,10 +506,10 @@ static bool read_device_global_mem_defs(
     }
 
     // read device global address
-    uint32_t dev_global_addr = 0; // Default
+    uint64_t dev_global_addr = 0; // Default
     if (result && counters.back() > 0) {
       result =
-          read_uint32_counters(config_str, curr_pos, dev_global_addr, counters);
+          read_uint64_counters(config_str, curr_pos, dev_global_addr, counters);
     }
     // read device global address size
     uint32_t dev_global_size = 0; // Default
@@ -493,8 +518,34 @@ static bool read_device_global_mem_defs(
           read_uint32_counters(config_str, curr_pos, dev_global_size, counters);
     }
 
-    acl_device_global_mem_def_t dev_global_def = {dev_global_addr,
-                                                  dev_global_size};
+    // read device global properties
+    auto host_access =
+        static_cast<unsigned int>(ACL_DEVICE_GLOBAL_HOST_ACCESS_READ_WRITE);
+    if (result && counters.back() > 0) {
+      result = read_uint_counters(config_str, curr_pos, host_access, counters);
+      if (host_access >=
+          static_cast<unsigned int>(ACL_DEVICE_GLOBAL_HOST_ACCESS_TYPE_COUNT))
+        result = false;
+    }
+    auto init_mode =
+        static_cast<unsigned int>(ACL_DEVICE_GLOBAL_INIT_MODE_REPROGRAM);
+    if (result && counters.back() > 0) {
+      result = read_uint_counters(config_str, curr_pos, init_mode, counters);
+      if (init_mode >=
+          static_cast<unsigned int>(ACL_DEVICE_GLOBAL_INIT_MODE_TYPE_COUNT))
+        result = false;
+    }
+    bool implement_in_csr = false;
+    if (result && counters.back() > 0) {
+      result =
+          read_bool_counters(config_str, curr_pos, implement_in_csr, counters);
+    }
+
+    acl_device_global_mem_def_t dev_global_def = {
+        dev_global_addr, dev_global_size,
+        static_cast<acl_device_global_host_access_t>(host_access),
+        static_cast<acl_device_global_init_mode_t>(init_mode),
+        implement_in_csr};
 
     bool ok =
         device_global_mem_defs.insert({device_global_name, dev_global_def})
             .second;
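
Aside (not part of the patch): the parser above leans on std::stoull for the new 64-bit address field. A rough standalone sketch of that pattern, with a local next_word() helper standing in for the runtime's read_word() and counter bookkeeping:

    #include <cstdint>
    #include <iostream>
    #include <string>

    // Local stand-in for the runtime's read_word(): returns the next
    // whitespace-delimited token at or after pos and advances pos.
    static std::string next_word(const std::string &str,
                                 std::string::size_type &pos) {
      pos = str.find_first_not_of(' ', pos);
      const auto end = str.find(' ', pos);
      std::string word = str.substr(pos, end - pos);
      pos = (end == std::string::npos) ? str.size() : end;
      return word;
    }

    // Sketch of the uint64 parse used by read_uint64_counters(): std::stoull
    // either returns a value or throws (std::invalid_argument /
    // std::out_of_range), which is turned into a "false" result.
    static bool parse_uint64(const std::string &str, std::string::size_type &pos,
                             uint64_t &val) noexcept {
      try {
        static_assert(sizeof(uint64_t) == sizeof(unsigned long long), "");
        val = static_cast<uint64_t>(std::stoull(next_word(str, pos)));
      } catch (const std::exception &) {
        return false;
      }
      return true;
    }

    int main() {
      std::string config = "kernel15_dev_global 4294967296 2048";
      std::string::size_type pos = config.find(' '); // skip the name
      uint64_t addr = 0, size = 0;
      std::cout << parse_uint64(config, pos, addr) << ' ' << addr << '\n'; // 1 4294967296
      std::cout << parse_uint64(config, pos, size) << ' ' << size << '\n'; // 1 2048
      return 0;
    }

As in read_uint64_counters(), a failed conversion surfaces as a false return value rather than an exception escaping the parser.
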
diff --git a/src/acl_device_binary.cpp b/src/acl_device_binary.cpp
index 3f44ee68..a3cef603 100644
--- a/src/acl_device_binary.cpp
+++ b/src/acl_device_binary.cpp
@@ -17,6 +17,7 @@
 #include
 #include
 #include
+#include <acl_kernel.h>
 #include
 #include
 
@@ -262,8 +263,13 @@ cl_int acl_device_binary_t::load_binary_pkg(int validate_compile_options,
   AND_CHECK(acl_pkg_read_section(pkg, ".acl.rand_hash", pkg_rand_hash.data(),
                                  data_len + 1),
             CL_INVALID_BINARY, FAILREAD_MSG " (rand_hash)");
+  // Note that we use dev_prog->device when checking for device globals. A
+  // matching binary hash suggests that the aocx currently on the device is
+  // the same as the aocx used to create this program, so we can peek at the
+  // device global setup now instead of after acl_load_device_def_from_str.
   if (dev_prog->device->def.autodiscovery_def.binary_rand_hash ==
-      std::string(pkg_rand_hash.data())) {
+          std::string(pkg_rand_hash.data()) &&
+      (!acl_device_has_reprogram_device_globals(dev_prog->device))) {
     dev_prog->device->last_bin = this;
     dev_prog->device->loaded_bin = this;
   }
diff --git a/src/acl_kernel.cpp b/src/acl_kernel.cpp
index f90b1770..a37d889b 100644
--- a/src/acl_kernel.cpp
+++ b/src/acl_kernel.cpp
@@ -3011,6 +3011,18 @@ int acl_kernel_has_unmapped_subbuffers(acl_mem_migrate_t *mem_migration) {
   return 0;
 }
 
+bool acl_device_has_reprogram_device_globals(cl_device_id device) {
+  const auto &device_global_mem_defs =
+      device->def.autodiscovery_def.device_global_mem_defs;
+  return device_global_mem_defs.end() !=
+         std::find_if(device_global_mem_defs.begin(),
+                      device_global_mem_defs.end(),
+                      [](const auto &name_and_def) {
+                        return name_and_def.second.init_mode ==
+                               ACL_DEVICE_GLOBAL_INIT_MODE_REPROGRAM;
+                      });
+}
+
 int acl_submit_kernel_device_op(cl_event event) {
   // No user-level scheduling blocks this kernel enqueue from running.
   // So submit it to the device op queue.
@@ -3049,15 +3061,18 @@ int acl_submit_kernel_device_op(cl_event event) {
     need_reprogram =
         device->last_bin->get_devdef().autodiscovery_def.binary_rand_hash !=
         dev_bin->get_devdef().autodiscovery_def.binary_rand_hash;
-  } else {
-    // compare hash of program that is on the device and the program required by
-    // kernel
+  } else if (!acl_device_has_reprogram_device_globals(device)) {
+    // A null last_bin suggests no reprogram has been scheduled at this point,
+    // so if the target device contains a device global with reprogram init
+    // mode we force a reprogram; otherwise compare the hash of the program
+    // that is on the device with the hash of the program required by the
+    // kernel.
     need_reprogram =
         device->def.autodiscovery_def.binary_rand_hash !=
         dev_bin->get_devdef().autodiscovery_def.binary_rand_hash;
   }
 
+  // Always reprogram in split kernel mode. This is a temporary workaround.
   if (event->context->split_kernel) {
-    // Always reprogram in split kernel mode. This is a temporary workaround.
     need_reprogram = true;
   }
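
Aside (not part of the patch): a condensed, illustrative restatement of the need_reprogram decision above. The flags and hashes are passed in explicitly only to make the three cases easy to see; the runtime reads them from the device and the kernel's device binary.

    #include <cassert>
    #include <string>

    // Illustrative-only summary of acl_submit_kernel_device_op()'s reprogram
    // decision after this change; the parameter names are stand-ins.
    static bool needs_reprogram(bool last_bin_valid, const std::string &loaded_hash,
                                const std::string &required_hash,
                                bool has_reprogram_device_globals,
                                bool split_kernel) {
      bool need_reprogram = true;
      if (last_bin_valid) {
        // A binary is already scheduled/loaded: compare hashes as before.
        need_reprogram = loaded_hash != required_hash;
      } else if (!has_reprogram_device_globals) {
        // Nothing scheduled yet and no reprogram-init device globals:
        // fall back to the hash reported by autodiscovery.
        need_reprogram = loaded_hash != required_hash;
      }
      // Otherwise (no last_bin but reprogram-init device globals exist),
      // need_reprogram stays true so the globals get re-initialized.
      if (split_kernel)
        need_reprogram = true; // temporary workaround, as in the patch
      return need_reprogram;
    }

    int main() {
      // Same hash, but a reprogram-init device global forces reprogramming
      // on the first submission (no last_bin yet).
      assert(needs_reprogram(false, "hashA", "hashA", true, false));
      // Same hash, no such device global: the reprogram can be skipped.
      assert(!needs_reprogram(false, "hashA", "hashA", false, false));
      return 0;
    }

The new middle case is what this change adds: with no last_bin and at least one reprogram-init device global, the hash comparison is skipped and the reprogram happens unconditionally.
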
     need_reprogram = true;
   }
diff --git a/test/acl_auto_configure_test.cpp b/test/acl_auto_configure_test.cpp
index fe1aa742..5cfe6fb1 100644
--- a/test/acl_auto_configure_test.cpp
+++ b/test/acl_auto_configure_test.cpp
@@ -36,7 +36,7 @@ TEST(auto_configure, simple) {
 #define VERSIONIDSTRINGIFY(x) #x
 #define VERSIONIDTOSTR(x) VERSIONIDSTRINGIFY(x)
 #define DEVICE_FIELDS " 23"
-#define DEVICE_FIELDS_DEV_GLOBAL " 30"
+#define DEVICE_FIELDS_DEV_GLOBAL " 36"
 #define DEVICE_FIELDS_OLD " 18"
 #define BOARDNAME "de4_gen2x4_swdimm"
 #define BOARDNAME2 "pcie385_a7"
@@ -99,10 +99,11 @@ TEST(auto_configure, simple) {
 
 // Device global autodiscovery entries
 #define NUM_DEV_GLOBAL " 2"
-#define NUM_DEV_GLOBAL_FIELD " 3" // containing dev_globa_name, address, size
-#define DEV_GLOBAL_1 \
-  " kernel15_dev_global 4096 2048" // in format of dev_globa_name, address, size
-#define DEV_GLOBAL_2 " kernel15_dev_global2 2048 1024"
+#define NUM_DEV_GLOBAL_FIELD \
+  " 6" // contains dev_global_name, address, size, host_access, init_mode,
+       // implement_in_csr, in that order
+#define DEV_GLOBAL_1 " kernel15_dev_global 4096 2048 3 1 0"
+#define DEV_GLOBAL_2 " kernel15_dev_global2 2048 1024 1 0 1"
 
 int parsed;
 std::string err_str;
@@ -283,8 +284,18 @@ TEST(auto_configure, simple) {
             m_device_def.autodiscovery_def.device_global_mem_defs.end());
   CHECK_EQUAL(4096, kernel15_dev_global->second.address);
   CHECK_EQUAL(2048, kernel15_dev_global->second.size);
+  CHECK_EQUAL(ACL_DEVICE_GLOBAL_HOST_ACCESS_NONE,
+              kernel15_dev_global->second.host_access);
+  CHECK_EQUAL(ACL_DEVICE_GLOBAL_INIT_MODE_RESET,
+              kernel15_dev_global->second.init_mode);
+  CHECK_EQUAL(false, kernel15_dev_global->second.implement_in_csr);
   CHECK_EQUAL(2048, kernel15_dev_global2->second.address);
   CHECK_EQUAL(1024, kernel15_dev_global2->second.size);
+  CHECK_EQUAL(ACL_DEVICE_GLOBAL_HOST_ACCESS_WRITE_ONLY,
+              kernel15_dev_global2->second.host_access);
+  CHECK_EQUAL(ACL_DEVICE_GLOBAL_INIT_MODE_REPROGRAM,
+              kernel15_dev_global2->second.init_mode);
+  CHECK_EQUAL(true, kernel15_dev_global2->second.implement_in_csr);
 
   // Check a second parsing.
   // It should allocate a new string for the name.
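
Aside (not part of the patch): decoding one of the 6-field entries used above by hand. The field order matches what read_device_global_mem_defs() now consumes (name, address, size, host_access, init_mode, implement_in_csr); the name tables below are just for printing.

    #include <cstdint>
    #include <iostream>
    #include <sstream>
    #include <string>

    int main() {
      const char *host_access_names[] = {"read_only", "write_only", "read_write",
                                         "none"};
      const char *init_mode_names[] = {"reprogram", "reset"};

      // Same entry as DEV_GLOBAL_1 in the test above.
      std::istringstream entry(" kernel15_dev_global 4096 2048 3 1 0");
      std::string name;
      uint64_t address = 0;
      uint32_t size = 0;
      unsigned host_access = 0, init_mode = 0, implement_in_csr = 0;
      entry >> name >> address >> size >> host_access >> init_mode >>
          implement_in_csr;

      // Prints: kernel15_dev_global @4096, 2048 bytes, host_access=none,
      //         init_mode=reset, implement_in_csr=0
      std::cout << name << " @" << address << ", " << size
                << " bytes, host_access=" << host_access_names[host_access]
                << ", init_mode=" << init_mode_names[init_mode]
                << ", implement_in_csr=" << implement_in_csr << "\n";
      return 0;
    }

For DEV_GLOBAL_1 this yields host_access 3 (NONE) and init_mode 1 (RESET), which is exactly what the CHECK_EQUALs above assert.
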
@@ -482,11 +493,13 @@ TEST(auto_configure, many_ok_forward_compatibility) {
 
   // sections and subsections to check forward compatibility
   std::string str(VERSIONIDTOSTR(
-      ACL_AUTO_CONFIGURE_VERSIONID) " 29 "
+      ACL_AUTO_CONFIGURE_VERSIONID) " 49 "
                                     "sample40byterandomhash000000000000000000 "
-                                    "a10gx 0 1 15 DDR 2 1 6 0 2147483648 100 "
-                                    "100 100 100 200 200 200 200 0 0 0 0 2 "
-                                    "1 name1 name2 0 0 47 "
+                                    "a10gx 0 1 17 DDR 2 1 6 0 2147483648 100 "
+                                    "100 100 100 0 - 0 200 200 200 200 0 0 0 "
+                                    "2 9 ms_dev_global1 2048 1024 3 0 0 300 "
+                                    "300 300 ms_dev_global2 4096 1024 1 1 1 "
+                                    "300 300 300 0 0 400 400 47 "
                                     "40 external_sort_stage_0 0 128 1 0 0 1 0 "
                                     "1 0 1 10 0 0 4 1 0 0 0 500 500 500 0 0 "
                                     "0 0 1 1 1 3 1 1 1 3 1 0 0 800 800 800 "
@@ -677,10 +690,10 @@ TEST(auto_configure, many_ok_forward_compatibility) {
 
 TEST(auto_configure, many_limit_check) {
   std::string str(VERSIONIDTOSTR(
-      ACL_AUTO_CONFIGURE_VERSIONID) " 15 "
+      ACL_AUTO_CONFIGURE_VERSIONID) " 19 "
                                     "sample40byterandomhash000000000000000000 "
-                                    "a10gx 0 1 7 DDR 2 1 2 0 2147483648 0 0 0 "
-                                    "0 75 "
+                                    "a10gx 0 1 9 DDR 2 1 2 0 2147483648 0 - 0 "
+                                    "0 0 0 0 0 75 " // 75 kernels
                                     "31 external_sort_stage_0 0 128 1 0 0 1 0 "
                                     "1 0 1 6 0 0 4 1 0 0 0 0 0 0 1 1 1 3 1 1 1 "
                                     "3 1 "
@@ -1193,14 +1206,14 @@ TEST(auto_configure, kernel_arg_info) {
 
 TEST(auto_configure, hostpipe) {
   std::string str(VERSIONIDTOSTR(
-      ACL_AUTO_CONFIGURE_VERSIONID) " 46 "
+      ACL_AUTO_CONFIGURE_VERSIONID) " 49 "
                                     "sample40byterandomhash000000000000000000 "
                                     "a10gx_hostpipe 0 1 15 DDR 2 1 6 0 "
                                     "2147483648 0 100 100 100 100 200 200 200 "
                                     "200 "
                                     "2 9 host_to_dev 1 0 32 32768 300 300 300 "
                                     "300 dev_to_host 0 1 32 32768 300 300 300 "
-                                    "300 400 1 3 name3 400 0 "
+                                    "300 400 1 6 dev_global_3 1024 2048 0 0 0 "
                                     "1 29 foo 0 128 1 0 0 1 0 1 0 0 0 0 0 0 1 "
                                     "1 1 3 1 1 1 3 1 0 0 800 800 800 900 "
                                     "900"
@@ -1230,10 +1243,10 @@ TEST(auto_configure, hostpipe) {
 
 TEST(auto_configure, streaming) {
   const std::string config_str{
-      "23 26 " RANDOM_HASH
+      "23 29 " RANDOM_HASH
      " pac_a10 0 1 13 DDR 2 2 24 1 2 0 4294967296 4294967296 8589934592 0 - 0 "
-      "0 0 0 1 3 device_global_name 256 128 1 105 _ZTS3CRCILi0EE 0 256 1 0 0 1 "
-      "0 1 0 9 8 0 0 8 1 0 0 1 k0_ZTS3CRCILi0EE_arg0 8 2 1 8 1024 0 3 1 "
+      "0 0 0 1 6 device_global_name 256 128 0 0 0 1 105 _ZTS3CRCILi0EE 0 256 1 "
+      "0 0 1 0 1 0 9 8 0 0 8 1 0 0 1 k0_ZTS3CRCILi0EE_arg0 8 2 1 8 1024 0 3 1 "
      "k0_ZTS3CRCILi0EE_arg1 8 0 0 8 1 0 0 1 k0_ZTS3CRCILi0EE_arg2 7 0 0 8 1 0 "
      "0 0 7 0 0 8 1 0 0 0 7 2 1 8 1024 0 2 0 7 0 0 8 1 0 0 0 7 0 0 8 1 0 0 0 "
      "7 0 0 8 1 0 0 0 0 0 1 2 64 4096 1 1 1 3 1 1 1 3 1 0 1 "
diff --git a/test/acl_kernel_test.cpp b/test/acl_kernel_test.cpp
index 460d4962..6a415e97 100644
--- a/test/acl_kernel_test.cpp
+++ b/test/acl_kernel_test.cpp
@@ -3968,7 +3968,7 @@ TEST(acl_kernel_reprogram_scheduler, switch_prog) {
   // set MEM_MIGRATE2.1 to COMPLETE +
   // set MEM_MIGRATE2.2 to RUNNING +
   // set MEM_MIGRATE2.2 to COMPLETE +
-  // submit KERNEL2 to device = 5
+  // submit KERNEL2 to device = 10
   CHECK_EQUAL(offset + 15, m_devlog.num_ops);
 
   // Should have copied the memory over.
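
Aside (not part of the patch): the device_global_reprogram test added in the next hunk counts device ops the same way the switch_prog comments above do. A rough sketch of that bookkeeping, with op names taken from the test's comments (the helper itself is hypothetical, not the runtime's device_op_queue):

    #include <cassert>
    #include <string>
    #include <vector>

    // Rough illustration of the device-op counting the new tests rely on.
    static std::vector<std::string> ops_for_kernel_launch(bool force_reprogram) {
      std::vector<std::string> ops;
      if (force_reprogram) {
        // submit REPROGRAM, then its RUNNING and COMPLETE transitions
        ops.insert(ops.end(), {"REPROGRAM submit", "REPROGRAM running",
                               "REPROGRAM complete"});
      }
      // two buffer migrations, each logging RUNNING and COMPLETE
      ops.insert(ops.end(), {"MEM_MIGRATE1 running", "MEM_MIGRATE1 complete",
                             "MEM_MIGRATE2 running", "MEM_MIGRATE2 complete"});
      ops.push_back("KERNEL submit");
      return ops;
    }

    int main() {
      // First launch on a device with a reprogram-init device global: 8 ops
      // before the kernel itself reports running/complete.
      assert(ops_for_kernel_launch(true).size() == 8);
      // Second launch (program already loaded): 5 ops.
      assert(ops_for_kernel_launch(false).size() == 5);
      return 0;
    }

That is where the test's "offset + 8" expectation for the first launch and the later "+ 5" for the second launch come from.
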
@@ -4332,6 +4332,397 @@ TEST(acl_kernel_reprogram_scheduler, switch_prog) {
   CHECK_EQUAL(CL_SUCCESS, clReleaseCommandQueue(cq2));
 }
 
+TEST(acl_kernel_reprogram_scheduler, device_global_reprogram) {
+  // In this test, we force the device to contain a device global with
+  // reprogram init mode. The device is first reprogrammed eagerly by the
+  // clCreateProgramWithBinary call, which sets last_bin and loaded_bin.
+  // We revert that by setting them to null again, to emulate a hardware
+  // device that has the binary on the board but has not yet been
+  // reprogrammed during execution.
+  // The kernel is launched twice: the first launch should trigger a
+  // reprogram even though the random hash matches, because of the device
+  // global; the second launch should not, since the device has already
+  // been reprogrammed during execution.
+
+  // Force device to contain device global
+  m_device->def.autodiscovery_def.device_global_mem_defs.insert(
+      {"dev_glob1",
+       {/* address */ 1024,
+        /* size */ 1024,
+        /* host_access */ ACL_DEVICE_GLOBAL_HOST_ACCESS_READ_WRITE,
+        /* init_mode */ ACL_DEVICE_GLOBAL_INIT_MODE_REPROGRAM,
+        /* implement_in_csr */ false}});
+
+  // Initial eager reprogram
+  int offset = m_devlog.num_ops;
+  CHECK_EQUAL(3, offset);
+  // Just the initial program load.
+  CHECK_EQUAL(m_first_dev_bin, m_device->last_bin);
+  CHECK_EQUAL(m_first_dev_bin, m_device->loaded_bin);
+
+  // Pretend execution starts now
+  m_device->last_bin->unload_content();
+  m_device->last_bin = NULL;
+  m_device->loaded_bin->unload_content();
+  m_device->loaded_bin = NULL;
+
+  acl_device_program_info_t *dp0 = check_dev_prog(m_program0);
+  m_context->reprogram_buf_read_callback = read_mem_callback;
+  m_context->reprogram_buf_write_callback = write_mem_callback;
+
+  // A device side buffer
+  cl_int status = CL_INVALID_VALUE;
+  cl_mem mem = clCreateBuffer(m_context, CL_MEM_READ_WRITE, 2048, 0, &status);
+  CHECK_EQUAL(CL_SUCCESS, status);
+  CHECK(mem);
+  memset(mem->host_mem.aligned_ptr, 'X', mem->size);
+  memset(mem->block_allocation->range.begin, 'x', mem->size);
+
+  CHECK_EQUAL(1, m_context->device_buffers_have_backing_store);
+  CHECK_EQUAL(0, mem->block_allocation->region->is_host_accessible);
+  CHECK_EQUAL(0, mem->writable_copy_on_host);
+
+  cl_kernel k = get_kernel(m_program0);
+  cl_event ue1 = get_user_event();
+  cl_event ue2 = get_user_event();
+  cl_event k_e1 = 0;
+  cl_event k_e2 = 0;
+
+  // Launch the kernel for the first time
+  CHECK_EQUAL(CL_SUCCESS, clSetKernelArg(k, 0, sizeof(cl_mem), &mem));
+  CHECK_EQUAL(CL_SUCCESS, clSetKernelArg(k, 1, sizeof(cl_mem), &mem));
+  CHECK_EQUAL(CL_SUCCESS, clEnqueueTask(m_cq, k, 1, &ue1, &k_e1));
+  CHECK_EQUAL(CL_COMMAND_TASK, k_e1->cmd.type);
+  CHECK(m_device->def.autodiscovery_def.binary_rand_hash ==
+        k_e1->cmd.info.ndrange_kernel.dev_bin->get_devdef()
+            .autodiscovery_def.binary_rand_hash);
+
+  // last_bin and loaded_bin should still be in a reset state
+  CHECK(m_device->last_bin == NULL);
+  CHECK(m_device->loaded_bin == NULL);
+
+  acl_print_debug_msg("Forcing user event completion for first kernel\n");
+  CHECK_EQUAL(CL_SUCCESS, clSetUserEventStatus(ue1, CL_COMPLETE));
+  CHECK_EQUAL(CL_SUCCESS, clReleaseEvent(ue1));
+
+  // Should have recorded that we loaded the program.
+  CHECK_EQUAL(&(dp0->device_binary), m_device->last_bin);
+  CHECK_EQUAL(&(dp0->device_binary), m_device->loaded_bin);
+
+  // submit device global forced REPROGRAM +
+  // set REPROGRAM to RUNNING +
+  // set REPROGRAM to COMPLETE +
+  // set MEM_MIGRATE 1 to RUNNING +
+  // set MEM_MIGRATE 1 to COMPLETE +
+  // set MEM_MIGRATE 2 to RUNNING +
+  // set MEM_MIGRATE 2 to COMPLETE +
+  // submit KERNEL = 8
+  CHECK_EQUAL(offset + 8, m_devlog.num_ops);
+  const acl_device_op_t *op0submit = &(m_devlog.before[3]);
+  const acl_device_op_t *op0running = &(m_devlog.before[4]);
+  const acl_device_op_t *op0complete = &(m_devlog.before[5]);
+
+  // Device global forced reprogram
+  CHECK_EQUAL(ACL_DEVICE_OP_REPROGRAM, op0submit->info.type);
+  CHECK_EQUAL(0, op0submit->id);
+  CHECK(op0submit->info.event);
+  CHECK_EQUAL(CL_SUBMITTED, op0submit->status);
+  CHECK_EQUAL(0, op0submit->info.num_printf_bytes_pending);
+  CHECK_EQUAL(1, op0submit->first_in_group);
+  CHECK_EQUAL(0, op0submit->last_in_group);
+
+  CHECK_EQUAL(ACL_DEVICE_OP_REPROGRAM, op0running->info.type);
+  CHECK_EQUAL(0, op0running->id);
+  CHECK(op0running->info.event);
+  CHECK_EQUAL(CL_RUNNING, op0running->status);
+  CHECK_EQUAL(0, op0running->info.num_printf_bytes_pending);
+  CHECK_EQUAL(1, op0running->first_in_group);
+  CHECK_EQUAL(0, op0running->last_in_group);
+
+  CHECK_EQUAL(ACL_DEVICE_OP_REPROGRAM, op0complete->info.type);
+  CHECK_EQUAL(0, op0complete->id);
+  CHECK(op0complete->info.event);
+  CHECK_EQUAL(CL_COMPLETE, op0complete->status);
+  CHECK_EQUAL(0, op0complete->info.num_printf_bytes_pending);
+  CHECK_EQUAL(1, op0complete->first_in_group);
+  CHECK_EQUAL(0, op0complete->last_in_group);
+
+  // The device is still programmed with the same program.
+  CHECK_EQUAL(&(dp0->device_binary), m_device->last_bin);
+  CHECK_EQUAL(&(dp0->device_binary), m_device->loaded_bin);
+
+  const acl_device_op_t *op1submit = &(m_devlog.before[10]);
+  CHECK_EQUAL(ACL_DEVICE_OP_KERNEL, op1submit->info.type);
+  CHECK_EQUAL(k_e1, op1submit->info.event);
+  CHECK_EQUAL(CL_SUBMITTED, op1submit->status);
+  CHECK_EQUAL(0, op1submit->info.num_printf_bytes_pending);
+  CHECK_EQUAL(0, op1submit->first_in_group); // reprogram is first
+  CHECK_EQUAL(1, op1submit->last_in_group);
+
+  // The user-level event is linked to the kernel device op now.
+  CHECK_EQUAL(op1submit->id, k_e1->current_device_op->id);
+
+  // Pretend to start the kernel
+  acl_print_debug_msg("Say kernel is running\n");
+  ACL_LOCKED(
+      acl_receive_kernel_update(k_e1->current_device_op->id, CL_RUNNING));
+  CHECK_EQUAL(CL_RUNNING, k_e1->current_device_op->execution_status);
+
+  ACL_LOCKED(acl_idle_update(m_context));
+
+  // Now we have a "running" transition
+  CHECK_EQUAL(offset + 9, m_devlog.num_ops);
+  const acl_device_op_t *op1running = &(m_devlog.after[11]);
+  CHECK_EQUAL(ACL_DEVICE_OP_KERNEL, op1running->info.type);
+  CHECK_EQUAL(k_e1, op1running->info.event);
+  CHECK_EQUAL(CL_RUNNING, op1running->status);
+  CHECK_EQUAL(0, op1running->info.num_printf_bytes_pending);
+  CHECK_EQUAL(0, op1running->first_in_group);
+  CHECK_EQUAL(1, op1running->last_in_group);
+
+  // The running status was propagated up to the user-level event.
+  CHECK_EQUAL(CL_RUNNING, k_e1->execution_status);
+
+  acl_print_debug_msg("Say kernel is complete\n");
+  ACL_LOCKED(
+      acl_receive_kernel_update(k_e1->current_device_op->id, CL_COMPLETE));
+  CHECK_EQUAL(CL_COMPLETE, k_e1->current_device_op->execution_status);
+
+  ACL_LOCKED(acl_idle_update(m_context));
+  // Now we have a "complete" transition
+  CHECK_EQUAL(offset + 10, m_devlog.num_ops);
+  const acl_device_op_t *op1complete = &(m_devlog.after[12]);
+  CHECK_EQUAL(ACL_DEVICE_OP_KERNEL, op1complete->info.type);
+  CHECK_EQUAL(k_e1, op1complete->info.event);
+  CHECK_EQUAL(CL_COMPLETE, op1complete->status);
+  CHECK_EQUAL(0, op1complete->info.num_printf_bytes_pending);
+  CHECK_EQUAL(0, op1complete->first_in_group);
+  CHECK_EQUAL(1, op1complete->last_in_group);
+
+  // Completion timestamp has propagated up to the user level event.
+  CHECK_EQUAL(
+      acl_platform.device_op_queue.op[op1complete->id].timestamp[CL_COMPLETE],
+      k_e1->timestamp[CL_COMPLETE]);
+
+  // Completion wipes out the downlink.
+  CHECK_EQUAL(0, k_e1->current_device_op);
+
+  // Launch the kernel for the second time
+  CHECK_EQUAL(CL_SUCCESS, clEnqueueTask(m_cq, k, 1, &ue2, &k_e2));
+  CHECK_EQUAL(CL_COMMAND_TASK, k_e2->cmd.type);
+  CHECK(m_device->def.autodiscovery_def.binary_rand_hash ==
+        k_e2->cmd.info.ndrange_kernel.dev_bin->get_devdef()
+            .autodiscovery_def.binary_rand_hash);
+
+  acl_print_debug_msg("Forcing user event completion for second kernel\n");
+  CHECK_EQUAL(CL_SUCCESS, clSetUserEventStatus(ue2, CL_COMPLETE));
+  CHECK_EQUAL(CL_SUCCESS, clReleaseEvent(ue2));
+
+  // Should still have the same program loaded
+  CHECK_EQUAL(&(dp0->device_binary), m_device->last_bin);
+  CHECK_EQUAL(&(dp0->device_binary), m_device->loaded_bin);
+
+  // set MEM_MIGRATE 1 to RUNNING +
+  // set MEM_MIGRATE 1 to COMPLETE +
+  // set MEM_MIGRATE 2 to RUNNING +
+  // set MEM_MIGRATE 2 to COMPLETE +
+  // submit KERNEL = 5
+  CHECK_EQUAL(offset + 15, m_devlog.num_ops);
+  const acl_device_op_t *op2submit = &(m_devlog.before[17]);
+  CHECK_EQUAL(ACL_DEVICE_OP_KERNEL, op2submit->info.type);
+  CHECK_EQUAL(k_e2, op2submit->info.event);
+  CHECK_EQUAL(CL_SUBMITTED, op2submit->status);
+  CHECK_EQUAL(0, op2submit->info.num_printf_bytes_pending);
+  CHECK_EQUAL(0, op2submit->first_in_group); // mem migration is first
+  CHECK_EQUAL(1, op2submit->last_in_group);
+
+  // The user-level event is linked to the kernel device op now.
+  CHECK_EQUAL(op2submit->id, k_e2->current_device_op->id);
+
+  // Pretend to start the kernel
+  acl_print_debug_msg("Say kernel is running\n");
+  ACL_LOCKED(
+      acl_receive_kernel_update(k_e2->current_device_op->id, CL_RUNNING));
+  CHECK_EQUAL(CL_RUNNING, k_e2->current_device_op->execution_status);
+
+  ACL_LOCKED(acl_idle_update(m_context));
+
+  // Now we have a "running" transition
+  CHECK_EQUAL(offset + 16, m_devlog.num_ops);
+  const acl_device_op_t *op2running = &(m_devlog.after[18]);
+  CHECK_EQUAL(ACL_DEVICE_OP_KERNEL, op2running->info.type);
+  CHECK_EQUAL(k_e2, op2running->info.event);
+  CHECK_EQUAL(CL_RUNNING, op2running->status);
+  CHECK_EQUAL(0, op2running->info.num_printf_bytes_pending);
+  CHECK_EQUAL(0, op2running->first_in_group);
+  CHECK_EQUAL(1, op2running->last_in_group);
+
+  // The running status was propagated up to the user-level event.
+  CHECK_EQUAL(CL_RUNNING, k_e2->execution_status);
+
+  acl_print_debug_msg("Say kernel is complete\n");
+  ACL_LOCKED(
+      acl_receive_kernel_update(k_e2->current_device_op->id, CL_COMPLETE));
+  CHECK_EQUAL(CL_COMPLETE, k_e2->current_device_op->execution_status);
+
+  ACL_LOCKED(acl_idle_update(m_context));
+  // Now we have a "complete" transition
+  CHECK_EQUAL(offset + 17, m_devlog.num_ops);
+  const acl_device_op_t *op2complete = &(m_devlog.after[19]);
+  CHECK_EQUAL(ACL_DEVICE_OP_KERNEL, op2complete->info.type);
+  CHECK_EQUAL(k_e2, op2complete->info.event);
+  CHECK_EQUAL(CL_COMPLETE, op2complete->status);
+  CHECK_EQUAL(0, op2complete->info.num_printf_bytes_pending);
+  CHECK_EQUAL(0, op2complete->first_in_group);
+  CHECK_EQUAL(1, op2complete->last_in_group);
+
+  // Completion timestamp has propagated up to the user level event.
+  CHECK_EQUAL(
+      acl_platform.device_op_queue.op[op2complete->id].timestamp[CL_COMPLETE],
+      k_e2->timestamp[CL_COMPLETE]);
+
+  // Completion wipes out the downlink.
+  CHECK_EQUAL(0, k_e2->current_device_op);
+
+  // And let go.
+  // (Don't check for CL_INVALID_EVENT on a second release of each of
+  // these events because the events might be reused.)
+  CHECK_EQUAL(CL_SUCCESS, clReleaseMemObject(mem));
+  CHECK_EQUAL(CL_SUCCESS, clReleaseEvent(k_e1));
+  CHECK_EQUAL(CL_SUCCESS, clReleaseEvent(k_e2));
+  CHECK_EQUAL(CL_SUCCESS, clReleaseKernel(k));
+
+  // Clean up device global
+  m_device->def.autodiscovery_def.device_global_mem_defs.clear();
+}
+
+TEST(acl_kernel_reprogram_scheduler, skip_reprogram_on_start) {
+  // Test that the reprogram is skipped when the binary currently loaded
+  // on the board is the same as the one about to be loaded.
+
+  // Initial eager reprogram
+  int offset = m_devlog.num_ops;
+  CHECK_EQUAL(3, offset);
+  // Just the initial program load.
+  CHECK_EQUAL(m_first_dev_bin, m_device->last_bin);
+  CHECK_EQUAL(m_first_dev_bin, m_device->loaded_bin);
+
+  // Pretend execution starts now
+  m_device->last_bin->unload_content();
+  m_device->last_bin = NULL;
+  m_device->loaded_bin->unload_content();
+  m_device->loaded_bin = NULL;
+
+  acl_device_program_info_t *dp0 = check_dev_prog(m_program0);
+  m_context->reprogram_buf_read_callback = read_mem_callback;
+  m_context->reprogram_buf_write_callback = write_mem_callback;
+
+  // A device side buffer
+  cl_int status = CL_INVALID_VALUE;
+  cl_mem mem = clCreateBuffer(m_context, CL_MEM_READ_WRITE, 2048, 0, &status);
+  CHECK_EQUAL(CL_SUCCESS, status);
+  CHECK(mem);
+  memset(mem->host_mem.aligned_ptr, 'X', mem->size);
+  memset(mem->block_allocation->range.begin, 'x', mem->size);
+
+  CHECK_EQUAL(1, m_context->device_buffers_have_backing_store);
+  CHECK_EQUAL(0, mem->block_allocation->region->is_host_accessible);
+  CHECK_EQUAL(0, mem->writable_copy_on_host);
+
+  cl_kernel k = get_kernel(m_program0);
+  cl_event ue = get_user_event();
+  cl_event k_e = 0;
+
+  // Launch the kernel for the first time
+  CHECK_EQUAL(CL_SUCCESS, clSetKernelArg(k, 0, sizeof(cl_mem), &mem));
+  CHECK_EQUAL(CL_SUCCESS, clSetKernelArg(k, 1, sizeof(cl_mem), &mem));
+  CHECK_EQUAL(CL_SUCCESS, clEnqueueTask(m_cq, k, 1, &ue, &k_e));
+  CHECK_EQUAL(CL_COMMAND_TASK, k_e->cmd.type);
+  CHECK(m_device->def.autodiscovery_def.binary_rand_hash ==
+        k_e->cmd.info.ndrange_kernel.dev_bin->get_devdef()
+            .autodiscovery_def.binary_rand_hash);
+
+  // last_bin and loaded_bin should still be in a reset state
+  CHECK(m_device->last_bin == NULL);
+  CHECK(m_device->loaded_bin == NULL);
+
+  acl_print_debug_msg("Forcing user event completion for first kernel\n");
+  CHECK_EQUAL(CL_SUCCESS, clSetUserEventStatus(ue, CL_COMPLETE));
+  CHECK_EQUAL(CL_SUCCESS, clReleaseEvent(ue));
+
+  // Since the reprogram didn't occur, only last_bin should be updated
+  CHECK_EQUAL(&(dp0->device_binary), m_device->last_bin);
+  CHECK(m_device->loaded_bin == NULL);
+
+  // set MEM_MIGRATE 1 to RUNNING +
+  // set MEM_MIGRATE 1 to COMPLETE +
+  // set MEM_MIGRATE 2 to RUNNING +
+  // set MEM_MIGRATE 2 to COMPLETE +
+  // submit KERNEL = 5
+  CHECK_EQUAL(offset + 5, m_devlog.num_ops);
+  const acl_device_op_t *op0submit = &(m_devlog.before[7]);
+  CHECK_EQUAL(ACL_DEVICE_OP_KERNEL, op0submit->info.type);
+  CHECK_EQUAL(k_e, op0submit->info.event);
+  CHECK_EQUAL(CL_SUBMITTED, op0submit->status);
+  CHECK_EQUAL(0, op0submit->info.num_printf_bytes_pending);
+  CHECK_EQUAL(0, op0submit->first_in_group); // mem migrate is first
+  CHECK_EQUAL(1, op0submit->last_in_group);
+
+  // The user-level event is linked to the kernel device op now.
+  CHECK_EQUAL(op0submit->id, k_e->current_device_op->id);
+
+  // Pretend to start the kernel
+  acl_print_debug_msg("Say kernel is running\n");
+  ACL_LOCKED(acl_receive_kernel_update(k_e->current_device_op->id, CL_RUNNING));
+  CHECK_EQUAL(CL_RUNNING, k_e->current_device_op->execution_status);
+
+  ACL_LOCKED(acl_idle_update(m_context));
+
+  // Now we have a "running" transition
+  CHECK_EQUAL(offset + 6, m_devlog.num_ops);
+  const acl_device_op_t *op0running = &(m_devlog.after[8]);
+  CHECK_EQUAL(ACL_DEVICE_OP_KERNEL, op0running->info.type);
+  CHECK_EQUAL(k_e, op0running->info.event);
+  CHECK_EQUAL(CL_RUNNING, op0running->status);
+  CHECK_EQUAL(0, op0running->info.num_printf_bytes_pending);
+  CHECK_EQUAL(0, op0running->first_in_group);
+  CHECK_EQUAL(1, op0running->last_in_group);
+
+  // The running status was propagated up to the user-level event.
+  CHECK_EQUAL(CL_RUNNING, k_e->execution_status);
+
+  acl_print_debug_msg("Say kernel is complete\n");
+  ACL_LOCKED(
+      acl_receive_kernel_update(k_e->current_device_op->id, CL_COMPLETE));
+  CHECK_EQUAL(CL_COMPLETE, k_e->current_device_op->execution_status);
+
+  ACL_LOCKED(acl_idle_update(m_context));
+  // Now we have a "complete" transition
+  CHECK_EQUAL(offset + 7, m_devlog.num_ops);
+  const acl_device_op_t *op0complete = &(m_devlog.after[9]);
+  CHECK_EQUAL(ACL_DEVICE_OP_KERNEL, op0complete->info.type);
+  CHECK_EQUAL(k_e, op0complete->info.event);
+  CHECK_EQUAL(CL_COMPLETE, op0complete->status);
+  CHECK_EQUAL(0, op0complete->info.num_printf_bytes_pending);
+  CHECK_EQUAL(0, op0complete->first_in_group);
+  CHECK_EQUAL(1, op0complete->last_in_group);
+
+  // Completion timestamp has propagated up to the user level event.
+  CHECK_EQUAL(
+      acl_platform.device_op_queue.op[op0complete->id].timestamp[CL_COMPLETE],
+      k_e->timestamp[CL_COMPLETE]);
+
+  // Completion wipes out the downlink.
+  CHECK_EQUAL(0, k_e->current_device_op);
+
+  // And let go.
+  // (Don't check for CL_INVALID_EVENT on a second release of each of
+  // these events because the events might be reused.)
+  CHECK_EQUAL(CL_SUCCESS, clReleaseMemObject(mem));
+  CHECK_EQUAL(CL_SUCCESS, clReleaseEvent(k_e));
+  CHECK_EQUAL(CL_SUCCESS, clReleaseKernel(k));
+}
+
 TEST(acl_kernel_reprogram_scheduler, use_host_buf_as_arg) {
   // Must be able to use a host-side buffer as a kernel argument.
   cl_int status = 0;