From 264fc3b711f4c1bad46ceda2f5056918c711904c Mon Sep 17 00:00:00 2001
From: Vivek Trivedi <5340687+trivedivivek@users.noreply.github.com>
Date: Thu, 7 Aug 2025 07:20:56 -0700
Subject: [PATCH] [ET-VK] 8/n Split dispatches between multiple command buffers.

This diff adds a config to limit the maximum number of command buffers created
when splitting execution between multiple command buffers.

Pull Request resolved: https://github.com/pytorch/executorch/pull/13113

This diff introduces a new configuration option, `execute_max_cmds`, to limit
the maximum number of command buffers created when splitting execution between
multiple command buffers. This feature allows for more efficient management of
command buffers, particularly when the graph contains a large number of nodes.

ghstack-source-id: 301393815
@exported-using-ghexport

Differential Revision: [D79575908](https://our.internmc.facebook.com/intern/diff/D79575908/)
---
 .../vulkan/runtime/graph/ComputeGraph.cpp    | 34 +++++++++++++++++--
 backends/vulkan/runtime/graph/ComputeGraph.h |  8 +++++
 backends/vulkan/runtime/graph/GraphConfig.h  |  4 +++
 3 files changed, 44 insertions(+), 2 deletions(-)

diff --git a/backends/vulkan/runtime/graph/ComputeGraph.cpp b/backends/vulkan/runtime/graph/ComputeGraph.cpp
index 7bc00e128e5..3b9061701e6 100644
--- a/backends/vulkan/runtime/graph/ComputeGraph.cpp
+++ b/backends/vulkan/runtime/graph/ComputeGraph.cpp
@@ -799,6 +799,33 @@ void ComputeGraph::prepare_pipelines() {
   pipeline_descriptors_ = std::unordered_set<
       vkapi::ComputePipelineCache::Key,
       vkapi::ComputePipelineCache::Hasher>();
+
+  const size_t total_node_count = execute_nodes_.size();
+  size_t init_threshold = config_.execute_initial_threshold_node_count;
+  size_t count_threshold = config_.execute_threshold_node_count;
+
+  // If a max command buffer count is set, adjust the thresholds so execution
+  // fits within the limit whenever the current thresholds would produce more
+  // command buffers than execute_max_cmds.
+  if (config_.execute_max_cmds > 0) {
+    // Worst case, one command buffer holds the nodes before the init
+    // threshold and config_.execute_max_cmds - 1 command buffers hold the
+    // rest of the dispatches.
+
+    // If the command buffers created after offsetting init_threshold would
+    // exceed the max command buffer count, adjust the count threshold.
+    const bool slicing_exceeds_max_cmds = (total_node_count - init_threshold) >
+        count_threshold * (config_.execute_max_cmds - 1);
+    if (total_node_count > init_threshold && slicing_exceeds_max_cmds) {
+      // Increase count threshold so the nodes remaining after init_threshold
+      // fit in config_.execute_max_cmds - 1 command buffers.
+      count_threshold = static_cast<size_t>(ceil(
+          (total_node_count - init_threshold) /
+          double(config_.execute_max_cmds - 1)));
+    }
+  }
+
+  execute_threshold_node_count_ = count_threshold;
 }

 void ComputeGraph::submit_current_cmd(const bool final_use) {
@@ -888,6 +915,7 @@ void ComputeGraph::execute() {
   context_->set_cmd(/*reusable = */ true);

   context_->cmd_reset_querypool();
+  const size_t total_node_count = execute_nodes_.size();
   uint32_t encoded_node_count = 0;

   for (std::unique_ptr<ExecuteNode>& node : execute_nodes_) {
@@ -900,11 +928,13 @@
     const bool reached_threshold = encoded_node_count >=
             config_.execute_initial_threshold_node_count &&
         ((encoded_node_count - config_.execute_initial_threshold_node_count) %
-             config_.execute_threshold_node_count ==
+             execute_threshold_node_count_ ==
         0);

     // Create a new command buffer when threshold is reached
-    if (reached_threshold) {
+    // But avoid it if this is the last node, since the last cmd buf is
+    // submitted after the loop
+    if (reached_threshold && encoded_node_count != total_node_count) {
       context_->submit_cmd_to_gpu(VK_NULL_HANDLE, false);
       deferred_cmd_list_.emplace_back(std::move(context_->extract_cmd()));
       context_->set_cmd(true);
diff --git a/backends/vulkan/runtime/graph/ComputeGraph.h b/backends/vulkan/runtime/graph/ComputeGraph.h
index 34b14250314..3baa4df4de6 100644
--- a/backends/vulkan/runtime/graph/ComputeGraph.h
+++ b/backends/vulkan/runtime/graph/ComputeGraph.h
@@ -207,6 +207,14 @@ class ComputeGraph final {
   // current Context's command buffer is submitted now.
   size_t staging_nbytes_in_cmd_ = 0;

+  // Number of nodes to encode before the current command buffer is submitted.
+  // If the command buffers created with config.execute_threshold_node_count
+  // would exceed config.execute_max_cmds, then execute_threshold_node_count_
+  // is increased to fit the command buffers within the limit. Otherwise,
+  // execute_threshold_node_count_ is set to
+  // config.execute_threshold_node_count.
+  size_t execute_threshold_node_count_ = 0;
+
  public:
   //
   // Accessors
diff --git a/backends/vulkan/runtime/graph/GraphConfig.h b/backends/vulkan/runtime/graph/GraphConfig.h
index 08505aa3345..aa5cd8f8c4e 100644
--- a/backends/vulkan/runtime/graph/GraphConfig.h
+++ b/backends/vulkan/runtime/graph/GraphConfig.h
@@ -61,6 +61,10 @@ struct GraphConfig final {
   // by taking more advantage of parallelism between the CPU and GPU.
   size_t execute_initial_threshold_node_count = 0;

+  // If this number is greater than 0, then during execute create at most this
+  // many command buffers.
+  size_t execute_max_cmds = 0;
+
   vkapi::Adapter* external_adapter;

   // Generate a default graph config with pre-configured settings
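
Reviewer note (not part of the patch): below is a minimal standalone sketch of the threshold adjustment that `prepare_pipelines()` performs with this change, with a worked example of how the adjusted threshold bounds the total number of command buffers. The helper name `adjusted_count_threshold` and the example numbers are illustrative only and do not appear in the diff.

```cpp
#include <cmath>
#include <cstddef>
#include <cstdio>

// Sketch of the threshold adjustment added to ComputeGraph::prepare_pipelines().
// Given the node count and the configured thresholds, raise the per-command-buffer
// node threshold so the total number of command buffers stays within max_cmds.
// The helper name and the example values below are illustrative, not from the patch.
size_t adjusted_count_threshold(
    size_t total_node_count,
    size_t init_threshold,
    size_t count_threshold,
    size_t max_cmds) {
  if (max_cmds == 0) {
    return count_threshold; // limit disabled, keep the configured threshold
  }
  // Worst case: one command buffer covers the first init_threshold nodes and
  // the remaining nodes must fit in (max_cmds - 1) command buffers.
  const bool exceeds = (total_node_count - init_threshold) >
      count_threshold * (max_cmds - 1);
  if (total_node_count > init_threshold && exceeds) {
    count_threshold = static_cast<size_t>(std::ceil(
        (total_node_count - init_threshold) / double(max_cmds - 1)));
  }
  return count_threshold;
}

int main() {
  // Example: 100 nodes, first submission after 10 nodes, then every 20 nodes,
  // but at most 4 command buffers overall.
  // Without the cap: 1 + ceil((100 - 10) / 20) = 6 command buffers.
  // With the cap: the threshold becomes ceil(90 / 3) = 30, so the splits fall
  // at nodes 10, 40, and 70, giving 1 + 3 = 4 command buffers.
  std::printf("%zu\n", adjusted_count_threshold(100, 10, 20, 4)); // prints 30
  return 0;
}
```

Note that only the recurring threshold is raised; `execute_initial_threshold_node_count` is left untouched, so the first submission still happens early enough to overlap CPU encoding with GPU work, as described in GraphConfig.h.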