diff --git a/backends/vulkan/runtime/graph/ComputeGraph.cpp b/backends/vulkan/runtime/graph/ComputeGraph.cpp
index 7bc00e128e5..3b9061701e6 100644
--- a/backends/vulkan/runtime/graph/ComputeGraph.cpp
+++ b/backends/vulkan/runtime/graph/ComputeGraph.cpp
@@ -799,6 +799,33 @@ void ComputeGraph::prepare_pipelines() {
   pipeline_descriptors_ = std::unordered_set<
       vkapi::ComputePipelineCache::Key,
       vkapi::ComputePipelineCache::Hasher>();
+
+  const size_t total_node_count = execute_nodes_.size();
+  size_t init_threshold = config_.execute_initial_threshold_node_count;
+  size_t count_threshold = config_.execute_threshold_node_count;
+
+  // If a max command buffer count is set and slicing with the current
+  // thresholds would produce more than execute_max_cmds command buffers,
+  // raise the count threshold so that execution fits within the limit.
+  if (config_.execute_max_cmds > 0) {
+    // Worst case scenario: one command buffer for the nodes before the init
+    // threshold and config_.execute_max_cmds - 1 command buffers for the rest
+    // of the dispatches.
+    // NOTE(review): with execute_max_cmds == 1 the divisor below is zero.
+    // If the command buffers created after offsetting init_threshold would
+    // exceed the max command buffer count, adjust the count threshold.
+    const bool slicing_exceeds_max_cmds = (total_node_count - init_threshold) >
+        count_threshold * (config_.execute_max_cmds - 1);
+    if (total_node_count > init_threshold && slicing_exceeds_max_cmds) {
+      // Increase count threshold so the nodes remaining after offsetting init
+      // fit in config_.execute_max_cmds - 1 command buffers.
+      count_threshold = static_cast<size_t>(ceil(
+          (total_node_count - init_threshold) /
+          double(config_.execute_max_cmds - 1)));
+    }
+  }
+
+  execute_threshold_node_count_ = count_threshold;
 }
 
 void ComputeGraph::submit_current_cmd(const bool final_use) {
@@ -888,6 +915,7 @@ void ComputeGraph::execute() {
     context_->set_cmd(/*reusable = */ true);
 
     context_->cmd_reset_querypool();
+    const size_t total_node_count = execute_nodes_.size();
     uint32_t encoded_node_count = 0;
 
     for (std::unique_ptr<ExecuteNode>& node : execute_nodes_) {
@@ -900,11 +928,13 @@ void ComputeGraph::execute() {
       const bool reached_threshold =
           encoded_node_count >= config_.execute_initial_threshold_node_count &&
           ((encoded_node_count - config_.execute_initial_threshold_node_count) %
-               config_.execute_threshold_node_count ==
+               execute_threshold_node_count_ ==
            0);
 
       // Create a new command buffer when threashold is reached
-      if (reached_threshold) {
+      // But skip this for the last node, since the last cmd buf is submitted
+      // after the loop.
+      if (reached_threshold && encoded_node_count != total_node_count) {
         context_->submit_cmd_to_gpu(VK_NULL_HANDLE, false);
         deferred_cmd_list_.emplace_back(std::move(context_->extract_cmd()));
         context_->set_cmd(true);
diff --git a/backends/vulkan/runtime/graph/ComputeGraph.h b/backends/vulkan/runtime/graph/ComputeGraph.h
index 34b14250314..3baa4df4de6 100644
--- a/backends/vulkan/runtime/graph/ComputeGraph.h
+++ b/backends/vulkan/runtime/graph/ComputeGraph.h
@@ -207,6 +207,14 @@ class ComputeGraph final {
   // current Context's command buffer is submitted now.
   size_t staging_nbytes_in_cmd_ = 0;
 
+  // Number of nodes to encode before submitting a command buffer.
+  // If the command buffers created with config.execute_threshold_node_count
+  // would exceed config.execute_max_cmds, execute_threshold_node_count_ is
+  // increased so that the command buffers fit within the limit. Otherwise,
+  // execute_threshold_node_count_ is set to
+  // config.execute_threshold_node_count.
+  size_t execute_threshold_node_count_ = 0;
+
  public:
   //
   // Accessors
diff --git a/backends/vulkan/runtime/graph/GraphConfig.h b/backends/vulkan/runtime/graph/GraphConfig.h
index 08505aa3345..aa5cd8f8c4e 100644
--- a/backends/vulkan/runtime/graph/GraphConfig.h
+++ b/backends/vulkan/runtime/graph/GraphConfig.h
@@ -61,6 +61,10 @@ struct GraphConfig final {
   // by taking more advantage of parallelism between the CPU and GPU.
   size_t execute_initial_threshold_node_count = 0;
 
+  // If this number is greater than 0, then at most this many command buffers
+  // are created during execute.
+  size_t execute_max_cmds = 0;
+
   vkapi::Adapter* external_adapter;
 
   // Generate a default graph config with pre-configured settings