// Patch summary: adds an optional cap on the number of command buffers that
// ComputeGraph::execute() creates.
//  - GraphConfig gains `execute_max_cmds`; the cap only applies when it is
//    greater than 0.
//  - ComputeGraph::prepare_pipelines() derives `execute_threshold_node_count_`
//    from config_.execute_threshold_node_count, raising it when the nodes
//    remaining after the initial threshold would not fit in
//    `execute_max_cmds - 1` command buffers.
//  - ComputeGraph::execute() uses that member as the modulus of the threshold
//    check and no longer submits inside the loop when the current node is the
//    last one.
// NOTE(review): with execute_max_cmds == 1 and total_node_count >
// init_threshold, the divisor `double(config_.execute_max_cmds - 1)` is 0.0,
// so the value handed to the integer cast is not finite - confirm how a cap
// of 1 should be handled.
// NOTE(review): `static_cast(ceil(` and `std::unique_ptr& node` look like they
// lost their template arguments when this diff was extracted - verify against
// the upstream change.
// NOTE(review): `execute_threshold_node_count_` defaults to 0 and is used as a
// modulus in execute(); presumably prepare_pipelines() always runs first -
// confirm.
diff --git a/backends/vulkan/runtime/graph/ComputeGraph.cpp b/backends/vulkan/runtime/graph/ComputeGraph.cpp index 7bc00e128e5..3b9061701e6 100644 --- a/backends/vulkan/runtime/graph/ComputeGraph.cpp +++ b/backends/vulkan/runtime/graph/ComputeGraph.cpp @@ -799,6 +799,33 @@ void ComputeGraph::prepare_pipelines() { pipeline_descriptors_ = std::unordered_set< vkapi::ComputePipelineCache::Key, vkapi::ComputePipelineCache::Hasher>(); + + const size_t total_node_count = execute_nodes_.size(); + size_t init_threshold = config_.execute_initial_threshold_node_count; + size_t count_threshold = config_.execute_threshold_node_count; + + // If max command buffer count is set, we need to adjust the thresholds to + // accommodate execution within the limit, if total command buffers with + // current thresholds would exceed execute_max_cmds + if (config_.execute_max_cmds > 0) { + // Worst case scenario we have one command buffer for nodes before init + // threshold and config_.execute_max_cmds - 1 command buffers for the rest + // of dispatches + + // If command buffers created after offsetting init_threshold would exceed + // max command buffer count, we need to adjust init and count thresholds + const bool slicing_exceeds_max_cmds = (total_node_count - init_threshold) > + count_threshold * (config_.execute_max_cmds - 1); + if (total_node_count > init_threshold && slicing_exceeds_max_cmds) { + // Increase count threshold so remaining nodes after offsetting init fits + // in config_.execute_max_cmds - 1 + count_threshold = static_cast(ceil( + (total_node_count - init_threshold) / + double(config_.execute_max_cmds - 1))); + } + } + + execute_threshold_node_count_ = count_threshold; } void ComputeGraph::submit_current_cmd(const bool final_use) { @@ -888,6 +915,7 @@ void ComputeGraph::execute() { context_->set_cmd(/*reusable = */ true); context_->cmd_reset_querypool(); + const size_t total_node_count = execute_nodes_.size(); uint32_t encoded_node_count = 0; for (std::unique_ptr& node 
: execute_nodes_) { @@ -900,11 +928,13 @@ void ComputeGraph::execute() { const bool reached_threshold = encoded_node_count >= config_.execute_initial_threshold_node_count && ((encoded_node_count - config_.execute_initial_threshold_node_count) % - config_.execute_threshold_node_count == + execute_threshold_node_count_ == 0); // Create a new command buffer when threashold is reached - if (reached_threshold) { + // But avoid it if this is the last node, since last cmd buf is submitted + // after the loop + if (reached_threshold && encoded_node_count != total_node_count) { context_->submit_cmd_to_gpu(VK_NULL_HANDLE, false); deferred_cmd_list_.emplace_back(std::move(context_->extract_cmd())); context_->set_cmd(true); diff --git a/backends/vulkan/runtime/graph/ComputeGraph.h b/backends/vulkan/runtime/graph/ComputeGraph.h index 34b14250314..3baa4df4de6 100644 --- a/backends/vulkan/runtime/graph/ComputeGraph.h +++ b/backends/vulkan/runtime/graph/ComputeGraph.h @@ -207,6 +207,14 @@ class ComputeGraph final { // current Context's command buffer is submitted now. size_t staging_nbytes_in_cmd_ = 0; + // Represents the nodes to wait before submitting commands. + // If command buffers created with config.execute_threshold_node_count exceeds + // config.execute_max_cmds, then execute_threshold_node_count will be + // increased to fit command buffers within the limit. Otherwise, + // execute_threshold_node_count will be set to + // config.execute_threshold_node_count. + size_t execute_threshold_node_count_ = 0; + public: // // Accessors diff --git a/backends/vulkan/runtime/graph/GraphConfig.h b/backends/vulkan/runtime/graph/GraphConfig.h index 08505aa3345..aa5cd8f8c4e 100644 --- a/backends/vulkan/runtime/graph/GraphConfig.h +++ b/backends/vulkan/runtime/graph/GraphConfig.h @@ -61,6 +61,10 @@ struct GraphConfig final { // by taking more advantage of parallelism between the CPU and GPU. 
size_t execute_initial_threshold_node_count = 0; + // If this number is greater than 0 then, during execute create at most this + // many command buffers. + size_t execute_max_cmds = 0; + vkapi::Adapter* external_adapter; // Generate a default graph config with pre-configured settings