diff --git a/include/acl_kernel_if.h b/include/acl_kernel_if.h index d52d69df..8ec8ef31 100644 --- a/include/acl_kernel_if.h +++ b/include/acl_kernel_if.h @@ -86,6 +86,8 @@ typedef struct { // CRA address offset for backwards compatibility unsigned int cra_address_offset = 8; + // Kernel static image cache for tracking changed work dimensions, etc. + std::vector<std::unique_ptr<char[]>> static_img_cache; // Kernel argument cache for trackinig changed arguments std::vector<std::unique_ptr<char[]>> accel_arg_cache; } acl_kernel_if; diff --git a/include/acl_types.h b/include/acl_types.h index a3556da5..b083e774 100644 --- a/include/acl_types.h +++ b/include/acl_types.h @@ -441,7 +441,7 @@ class acl_device_program_info_t { // don't expect it. #pragma pack(push, 4) // These are the bytes written to global memory for a kernel invocation. -typedef struct { +typedef struct acl_dev_kernel_invocation_image { // The activation_id is the index into the device op queue. // The value at acl_platform.device_op_queue[activation_id] will be // updated asynchronously by the HAL, so its address must remain stable. @@ -485,6 +485,19 @@ typedef struct { char *arg_value; size_t arg_value_size; + // Define constructor to initialize the invocation image to default values + // Hard code for now + acl_dev_kernel_invocation_image() + : activation_id(0), accel_id(0), work_dim(1), work_group_size(1), + padding(0), arg_value(NULL), arg_value_size(0) { + for (unsigned i = 0; i < 3; ++i) { + global_work_size[i] = 1; + num_groups[i] = 1; + local_work_size[i] = 1; + global_work_offset[i] = 0; + } + } + } acl_dev_kernel_invocation_image_t; // Invocation image structure that matches the 18.1 CRA layout.
diff --git a/src/acl_kernel_if.cpp b/src/acl_kernel_if.cpp index b31a7567..9465aa8f 100644 --- a/src/acl_kernel_if.cpp +++ b/src/acl_kernel_if.cpp @@ -97,22 +97,23 @@ void acl_kernel_if_register_callbacks( // **************************** Utility Functions *************************** // ************************************************************************** void print_invocation_image(acl_kernel_if *kern, char *image_ptr, - size_t image_size, unsigned int offset, - bool is_static) { + size_t image_size, size_t size_to_write, + unsigned int csr_offset, bool is_static, + bool is_write = true, size_t print_offset = 0) { std::string image_type = is_static ? "stat" : "args"; - for (uintptr_t p = 0; p < image_size; p += sizeof(int)) { + std::string overwrite = is_write ? "Writing" : "Keeping"; + size_t print_end = print_offset + size_to_write; + assert(print_end <= image_size && "printing invocation image out of bound"); + for (uintptr_t p = print_offset; p < print_end; p += sizeof(int)) { unsigned int pword = 0; - if (p + sizeof(int) > image_size) { - for (size_t i = 0; i < image_size - p; i += sizeof(char)) { - safe_memcpy(((char *)(&pword)) + i, image_ptr + p + i, sizeof(char), - sizeof(int), image_size - p - i); - } - } else { - pword = *(unsigned int *)(image_ptr + p); - } + uintptr_t cpy_size = + (print_end - p > sizeof(int)) ? 
sizeof(int) : (print_end - p); + safe_memcpy(((char *)(&pword)), image_ptr + p, cpy_size * sizeof(char), + sizeof(int), (print_end - p) * sizeof(char)); ACL_KERNEL_IF_DEBUG_MSG_VERBOSE( - kern, 2, ":: Writing inv image (%s) [%2d] @%8p := %4x\n", - image_type.c_str(), (int)(p), (void *)(offset + p), pword); + kern, 2, ":: %s inv image (%s) [%2d] @%8p := %4x\n", + overwrite.c_str(), image_type.c_str(), (int)(p), + (void *)(csr_offset + p), pword); } } @@ -879,6 +880,7 @@ int acl_kernel_if_update(const acl_device_def_autodiscovery_t &devdef, if (kern->num_accel > 0) { kern->accel_job_ids.resize(kern->num_accel); kern->accel_invoc_queue_depth.resize(kern->num_accel); + kern->static_img_cache.resize(kern->num_accel); kern->accel_arg_cache.resize(kern->num_accel); // Kernel IRQ is a separate thread. Need to use circular buffer to make this @@ -886,7 +888,15 @@ int acl_kernel_if_update(const acl_device_def_autodiscovery_t &devdef, kern->accel_queue_front.resize(kern->num_accel); kern->accel_queue_back.resize(kern->num_accel); + acl_dev_kernel_invocation_image_t default_invocation; + size_t image_size_static = + (size_t)((uintptr_t) & (default_invocation.arg_value) - (uintptr_t) & + (default_invocation.work_dim)); + for (unsigned a = 0; a < kern->num_accel; ++a) { + kern->static_img_cache[a] = std::make_unique<char[]>(image_size_static); + memcpy(kern->static_img_cache[a].get(), + (char *)(&(default_invocation.work_dim)), image_size_static); unsigned int max_same_accel_launches = devdef.accel[a].fast_launch_depth + 1; // +1, because fast launch depth does not account for the running kernel @@ -1134,27 +1144,37 @@ void acl_kernel_if_launch_kernel_on_custom_sof( (image->work_dim)); } - if ((kern->io.debug_verbosity) >= 2) { - // We only write the static part of the invocation image if the kernel uses - // CRA control.
- if (!kern->streaming_control_signal_names[accel_id]) { - print_invocation_image(kern, (char *)image_p, image_size_static, offset, - true); - } - - if (kern->csr_version.has_value() && - (kern->csr_version != CSR_VERSION_ID_18_1)) { - print_invocation_image(kern, image->arg_value, image->arg_value_size, - (unsigned int)(offset + image_size_static), false); - } - } - // When csr version is 18.1, the kernel args is part of the image. otherwise, // it is in dynamic memory. Only write the static part of the invocation // image if this kernel uses CRA control. if (!kern->streaming_control_signal_names[accel_id]) { - acl_kernel_cra_write_block(kern, accel_id, offset, (unsigned int *)image_p, - image_size_static); + if (kern->csr_version == CSR_VERSION_ID_18_1) { + // Just write everything for older CSR version + if ((kern->io.debug_verbosity) >= 2) { + print_invocation_image(kern, (char *)image_p, image_size_static, + image_size_static, offset, true); + } + acl_kernel_cra_write_block(kern, accel_id, offset, + (unsigned int *)image_p, image_size_static); + } else { + char *img_cache_ptr = kern->static_img_cache[accel_id].get(); + assert(img_cache_ptr && "kernel image cache not initialized!"); + if (memcmp(img_cache_ptr, (char *)image_p, image_size_static) != 0) { + // Something changed in static part of the invocation image, + // write everything to csr + if ((kern->io.debug_verbosity) >= 2) { + print_invocation_image(kern, (char *)image_p, image_size_static, + image_size_static, offset, true); + } + acl_kernel_cra_write_block(kern, accel_id, offset, + (unsigned int *)image_p, image_size_static); + memcpy(img_cache_ptr, (char *)image_p, image_size_static); + } else if ((kern->io.debug_verbosity) >= 2) { + // Nothing's changed, just print the static part of the invocation image + print_invocation_image(kern, (char *)image_p, image_size_static, + image_size_static, offset, true, false); + } + } } bool accel_has_agent_args = false; @@ -1162,9 +1182,17 @@ void 
acl_kernel_if_launch_kernel_on_custom_sof( (kern->csr_version != CSR_VERSION_ID_18_1 && image->arg_value_size > 0)) { accel_has_agent_args = true; if (!kern->accel_arg_cache[accel_id]) { + // The first time invoking the kernel, just write all the arguments + if ((kern->io.debug_verbosity) >= 2) { + print_invocation_image(kern, image->arg_value, image->arg_value_size, + image->arg_value_size, + (unsigned int)(offset + image_size_static), + false); + } acl_kernel_cra_write_block( kern, accel_id, offset + (unsigned int)image_size_static, (unsigned int *)image->arg_value, image->arg_value_size); + // Initialize kernel argument cache and cache the values kern->accel_arg_cache[accel_id] = std::make_unique<char[]>(image->arg_value_size); memcpy(kern->accel_arg_cache[accel_id].get(), (char *)image->arg_value, @@ -1177,6 +1205,7 @@ void acl_kernel_if_launch_kernel_on_custom_sof( size_t cmp_size = (image->arg_value_size - step) > sizeof(int) ? sizeof(int) : (image->arg_value_size - step); + // Find range of changed arguments and record size of that block while (cmp_size > 0 && memcmp(arg_cache_ptr + step + size_to_write, image->arg_value + step + size_to_write, cmp_size) != 0) { @@ -1187,8 +1216,23 @@ void acl_kernel_if_launch_kernel_on_custom_sof( : (image->arg_value_size - step - size_to_write); } if (size_to_write == 0) { - step += (unsigned)sizeof(int); + // Current compared block is the same as before, skipping write + size_t size_to_skip = (image->arg_value_size - step > sizeof(int)) + ?
sizeof(int) + : (image->arg_value_size - step); + if ((kern->io.debug_verbosity) >= 2) { + print_invocation_image( + kern, image->arg_value, image->arg_value_size, size_to_skip, + (unsigned int)(offset + image_size_static), false, false, step); + } + step += size_to_skip; } else { + // Write the changed argument block to csr + if ((kern->io.debug_verbosity) >= 2) { + print_invocation_image( + kern, image->arg_value, image->arg_value_size, size_to_write, + (unsigned int)(offset + image_size_static), false, true, step); + } acl_kernel_cra_write_block( kern, accel_id, offset + (unsigned int)(image_size_static + step), (unsigned int *)(image->arg_value + step), size_to_write); @@ -1692,6 +1736,7 @@ void acl_kernel_if_close(acl_kernel_if *kern) { kern->accel_invoc_queue_depth.clear(); kern->accel_queue_front.clear(); kern->accel_queue_back.clear(); + kern->static_img_cache.clear(); kern->accel_arg_cache.clear(); kern->autorun_profiling_kernel_id = -1; }