
Commit db3e22b

Enable optimized deepseekv2 -- part 1 (#3420)
1 parent 4196b77 commit db3e22b

File tree

25 files changed: +3448 −8 lines changed

csrc/cpu/aten/MoE.cpp

Lines changed: 185 additions & 0 deletions
@@ -7,6 +7,7 @@ namespace cpu {
 
 IPEX_DEFINE_DISPATCH(mixtral_moe_tpp_kernel_stub);
 IPEX_DEFINE_DISPATCH(mixtral_moe_woq_kernel_stub);
+IPEX_DEFINE_DISPATCH(deepseek_moe_woq_kernel_stub);
 IPEX_DEFINE_DISPATCH(mixtral_moe_kernel_stub);
 
 at::Tensor mixtral_moe_tpp(
@@ -38,6 +39,41 @@ at::Tensor mixtral_moe_tpp(
       is_distributed);
 }
 
+at::Tensor deepseek_moe_tpp(
+    const at::Tensor& hidden_states,
+    const at::Tensor& expert_mask,
+    const std::vector<at::Tensor>& gate_wei,
+    const std::vector<at::Tensor>& up_wei,
+    const std::vector<at::Tensor>& down_wei,
+    bool tpp_fallback,
+    const at::Tensor& routing_weights,
+    at::Tensor& output,
+    bool is_distributed) {
+  RECORD_FUNCTION("ipex::deepseek_moe_tpp", c10::ArrayRef<c10::IValue>({}));
+
+  int num_experts = gate_wei.size();
+  for (auto i = 0; i < num_experts; i++) {
+    auto non_zero = expert_mask[i].nonzero();
+    if (non_zero.sizes()[0] == 0)
+      continue;
+    auto idx = non_zero.select(1, 0);
+    auto top_x = non_zero.select(1, 1);
+    output = mixtral_moe_tpp_kernel_stub(
+        kCPU,
+        hidden_states,
+        top_x,
+        idx,
+        gate_wei[i],
+        up_wei[i],
+        down_wei[i],
+        tpp_fallback,
+        routing_weights,
+        output,
+        is_distributed);
+  }
+  return output;
+}
+
 at::Tensor mixtral_moe(
     const at::Tensor& hidden_states,
     const at::Tensor& top_x,
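The new deepseek_moe_tpp loops over experts and reuses the existing per-expert Mixtral TPP kernel stub unchanged: expert_mask[i].nonzero() lists the (top-k slot, token) pairs routed to expert i, select(1, 0) extracts the slot index (idx) and select(1, 1) the token index (top_x). A minimal Python sketch of that mask layout and decomposition, assuming the Hugging Face Mixtral convention (one-hot of the router's top-k choices, permuted to [num_experts, top_k, num_tokens]); it is illustrative only and not part of this commit:

import torch

num_tokens, num_experts, top_k = 8, 4, 2
router_logits = torch.randn(num_tokens, num_experts)
routing_weights, selected = torch.topk(router_logits.softmax(dim=-1), top_k, dim=-1)
# [num_tokens, top_k, num_experts] -> [num_experts, top_k, num_tokens]
expert_mask = torch.nn.functional.one_hot(selected, num_classes=num_experts).permute(2, 1, 0)

for i in range(num_experts):
    non_zero = expert_mask[i].nonzero()  # mirrors expert_mask[i].nonzero() above
    if non_zero.size(0) == 0:
        continue                         # expert i received no tokens
    idx = non_zero[:, 0]                 # top-k slot, column 0 (select(1, 0))
    top_x = non_zero[:, 1]               # token index, column 1 (select(1, 1))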
@@ -72,6 +108,87 @@ at::Tensor mixtral_moe(
       output,
       is_distributed);
 }
+
+at::Tensor deepseek_moe(
+    const at::Tensor& hidden_states,
+    const at::Tensor& expert_mask,
+    const std::vector<at::Tensor>& gate_wei,
+    const std::vector<c10::intrusive_ptr<LinearOpContext>>& gate_op_ctx,
+    const std::vector<at::Tensor>& up_wei,
+    const std::vector<c10::intrusive_ptr<LinearOpContext>>& up_op_ctx,
+    const std::vector<at::Tensor>& down_wei,
+    const std::vector<c10::intrusive_ptr<LinearOpContext>>& down_op_ctx,
+    const at::Tensor& routing_weights,
+    at::Tensor& output,
+    bool is_distributed) {
+  RECORD_FUNCTION("ipex::deepseek_moe", c10::ArrayRef<c10::IValue>({}));
+
+  int num_experts = gate_wei.size();
+  for (auto i = 0; i < num_experts; i++) {
+    auto non_zero = expert_mask[i].nonzero();
+    if (non_zero.sizes()[0] == 0)
+      continue;
+    auto idx = non_zero.select(1, 0);
+    auto top_x = non_zero.select(1, 1);
+
+    output = mixtral_moe_kernel_stub(
+        kCPU,
+        hidden_states,
+        top_x,
+        idx,
+        gate_wei[i],
+        gate_op_ctx[i]->get_data_handle(),
+        up_wei[i],
+        up_op_ctx[i]->get_data_handle(),
+        down_wei[i],
+        down_op_ctx[i]->get_data_handle(),
+        true,
+        routing_weights,
+        output,
+        is_distributed);
+  }
+  return output;
+}
+
+at::Tensor deepseek_moe_mkl(
+    const at::Tensor& hidden_states,
+    const at::Tensor& expert_mask,
+    const std::vector<at::Tensor>& gate_wei,
+    const std::vector<c10::intrusive_ptr<MKLOpContext>>& gate_op_ctx,
+    const std::vector<at::Tensor>& up_wei,
+    const std::vector<c10::intrusive_ptr<MKLOpContext>>& up_op_ctx,
+    const std::vector<at::Tensor>& down_wei,
+    const std::vector<c10::intrusive_ptr<MKLOpContext>>& down_op_ctx,
+    const at::Tensor& routing_weights,
+    at::Tensor& output,
+    bool is_distributed) {
+  RECORD_FUNCTION("ipex::deepseek_moe_mkl", c10::ArrayRef<c10::IValue>({}));
+
+  int num_experts = gate_wei.size();
+  for (auto i = 0; i < num_experts; i++) {
+    auto non_zero = expert_mask[i].nonzero();
+    if (non_zero.sizes()[0] == 0)
+      continue;
+    auto idx = non_zero.select(1, 0);
+    auto top_x = non_zero.select(1, 1);
+    output = mixtral_moe_kernel_stub(
+        kCPU,
+        hidden_states,
+        top_x,
+        idx,
+        gate_wei[i],
+        gate_op_ctx[i]->get_data_handle(),
+        up_wei[i],
+        up_op_ctx[i]->get_data_handle(),
+        down_wei[i],
+        down_op_ctx[i]->get_data_handle(),
+        false,
+        routing_weights,
+        output,
+        is_distributed);
+  }
+  return output;
+}
 at::Tensor mixtral_moe_woq(
     const at::Tensor& hidden_states,
     const at::Tensor& top_x,
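Aside from the op-context type, deepseek_moe and deepseek_moe_mkl differ only in the hard-coded boolean forwarded to mixtral_moe_kernel_stub: true in deepseek_moe (the oneDNN prepacked LinearOpContext path) versus false in deepseek_moe_mkl (the MKLOpContext path), matching the position of the use_dnnl parameter in the mixtral_moe schema registered below.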
@@ -98,6 +215,38 @@ at::Tensor mixtral_moe_woq(
       output,
       is_distributed);
 }
+at::Tensor deepseek_moe_woq(
+    const at::Tensor& hidden_states,
+    const at::Tensor& expert_mask,
+    const std::vector<c10::intrusive_ptr<WoqLinearOpContext>>& gate_ctx,
+    const std::vector<c10::intrusive_ptr<WoqLinearOpContext>>& up_ctx,
+    const std::vector<c10::intrusive_ptr<WoqLinearOpContext>>& down_ctx,
+    const at::Tensor& routing_weights,
+    at::Tensor& output,
+    bool is_distributed) {
+  RECORD_FUNCTION("ipex::deepseek_moe_woq", c10::ArrayRef<c10::IValue>({}));
+
+  int num_experts = gate_ctx.size();
+  for (auto i = 0; i < num_experts; i++) {
+    auto non_zero = expert_mask[i].nonzero();
+    if (non_zero.sizes()[0] == 0)
+      continue;
+    auto idx = non_zero.select(1, 0);
+    auto top_x = non_zero.select(1, 1);
+    output = mixtral_moe_woq_kernel_stub(
+        kCPU,
+        hidden_states,
+        top_x,
+        idx,
+        gate_ctx[i]->get_data_handle(),
+        up_ctx[i]->get_data_handle(),
+        down_ctx[i]->get_data_handle(),
+        routing_weights,
+        output,
+        is_distributed);
+  }
+  return output;
+}
 } // namespace cpu
 } // namespace torch_ipex
 
@@ -112,17 +261,53 @@ TORCH_LIBRARY_FRAGMENT(torch_ipex, m) {
       "mixtral_moe_tpp",
       c10::DispatchKey::CPU,
       torch_ipex::cpu::mixtral_moe_tpp);
+  m.def(
+      "deepseek_moe_tpp(Tensor hidden_states, Tensor expert_mask, Tensor[] gate_wei, \
+      Tensor[] up_wei, Tensor[] down_wei, bool tpp_fallback, Tensor routing_weights, \
+      Tensor output, bool is_distributed) -> Tensor");
+  m.impl(
+      "deepseek_moe_tpp",
+      c10::DispatchKey::CPU,
+      torch_ipex::cpu::deepseek_moe_tpp);
   m.def(
       "mixtral_moe(Tensor hidden_states, Tensor top_x, Tensor idx, Tensor gate_wei, \
      Tensor gate_op_ctx, Tensor up_wei, Tensor up_op_ctx, Tensor down_wei, \
      Tensor down_op_ctx, bool use_dnnl, Tensor routing_weights, Tensor output, bool is_distributed) -> Tensor");
   m.impl("mixtral_moe", c10::DispatchKey::CPU, torch_ipex::cpu::mixtral_moe);
+  m.def(
+      "deepseek_moe(Tensor hidden_states, Tensor expert_mask, Tensor[] gate_wei, \
+      __torch__.torch.classes.ipex_prepack.LinearOpContext[] gate_op_ctx, Tensor[] up_wei, \
+      __torch__.torch.classes.ipex_prepack.LinearOpContext[] up_op_ctx, Tensor[] down_wei, \
+      __torch__.torch.classes.ipex_prepack.LinearOpContext[] down_op_ctx, Tensor routing_weights, \
+      Tensor output, bool is_distributed) -> Tensor");
+  m.impl("deepseek_moe", c10::DispatchKey::CPU, torch_ipex::cpu::deepseek_moe);
+  m.def(
+      "deepseek_moe_mkl(Tensor hidden_states, Tensor expert_mask, Tensor[] gate_wei, \
+      __torch__.torch.classes.ipex_prepack.MKLOpContext[] gate_op_ctx, Tensor[] up_wei, \
+      __torch__.torch.classes.ipex_prepack.MKLOpContext[] up_op_ctx, \
+      Tensor[] down_wei, __torch__.torch.classes.ipex_prepack.MKLOpContext[] down_op_ctx, \
+      Tensor routing_weights, Tensor output, bool is_distributed) -> Tensor");
+  m.impl(
+      "deepseek_moe_mkl",
+      c10::DispatchKey::CPU,
+      torch_ipex::cpu::deepseek_moe_mkl);
   m.def(
       "mixtral_moe_woq(Tensor hidden_states, Tensor top_x, Tensor idx, Tensor gate_wei, \
      Tensor up_wei, Tensor down_wei, Tensor routing_weights, Tensor output, bool is_distributed) -> Tensor");
   m.impl(
       "mixtral_moe_woq",
       c10::DispatchKey::CPU,
       torch_ipex::cpu::mixtral_moe_woq);
+  m.def(
+      "deepseek_moe_woq(Tensor hidden_states, Tensor expert_mask, \
+      __torch__.torch.classes.ipex_prepack.WoqLinearOpContext[] gate_ctx, \
+      __torch__.torch.classes.ipex_prepack.WoqLinearOpContext[] up_ctx, \
+      __torch__.torch.classes.ipex_prepack.WoqLinearOpContext[] down_ctx, \
+      Tensor routing_weights, Tensor output, bool is_distributed) -> Tensor");
+
+  m.impl(
+      "deepseek_moe_woq",
+      c10::DispatchKey::CPU,
+      torch_ipex::cpu::deepseek_moe_woq);
 }
 } // namespace
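Because these schemas are registered on the torch_ipex library fragment, the ops surface in Python as torch.ops.torch_ipex.*. A hypothetical call site for deepseek_moe_tpp follows; the shapes, sizes, and mask construction are assumptions chosen for illustration, and running it requires an intel_extension_for_pytorch build that contains this commit:

import torch
import intel_extension_for_pytorch  # noqa: F401  -- loads the torch_ipex op library

E, K, T, H, I = 4, 2, 8, 256, 512   # experts, top-k, tokens, hidden, intermediate (assumed)
hidden_states = torch.randn(T, H)
gate_wei = [torch.randn(I, H) for _ in range(E)]   # per-expert gate/up/down projections (assumed shapes)
up_wei = [torch.randn(I, H) for _ in range(E)]
down_wei = [torch.randn(H, I) for _ in range(E)]
routing_weights, selected = torch.topk(torch.randn(T, E).softmax(-1), K, dim=-1)
expert_mask = torch.nn.functional.one_hot(selected, E).permute(2, 1, 0)
output = torch.zeros(T, H)

output = torch.ops.torch_ipex.deepseek_moe_tpp(
    hidden_states, expert_mask, gate_wei, up_wei, down_wei,
    True,               # tpp_fallback
    routing_weights, output,
    False,              # is_distributed
)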

csrc/cpu/aten/MoE.h

Lines changed: 101 additions & 0 deletions
@@ -2,6 +2,7 @@
 
 #include <ATen/ATen.h>
 #include <dyndisp/DispatchStub.h>
+#include "Linear.h"
 
 namespace torch_ipex {
 namespace cpu {
@@ -16,6 +17,35 @@ at::Tensor mixtral_moe_tpp(
     const at::Tensor&,
     at::Tensor&,
     bool);
+at::Tensor deepseek_moe_tpp(
+    const at::Tensor&,
+    const at::Tensor&,
+    const std::vector<at::Tensor>&,
+    const std::vector<at::Tensor>&,
+    const std::vector<at::Tensor>&,
+    bool,
+    const at::Tensor&,
+    at::Tensor&,
+    bool);
+at::Tensor mixtral_moe_woq(
+    const at::Tensor&,
+    const at::Tensor&,
+    const at::Tensor&,
+    const at::Tensor&,
+    const at::Tensor&,
+    const at::Tensor&,
+    const at::Tensor&,
+    at::Tensor&,
+    bool);
+at::Tensor deepseek_moe_woq(
+    const at::Tensor&,
+    const at::Tensor&,
+    const std::vector<c10::intrusive_ptr<WoqLinearOpContext>>&,
+    const std::vector<c10::intrusive_ptr<WoqLinearOpContext>>&,
+    const std::vector<c10::intrusive_ptr<WoqLinearOpContext>>&,
+    const at::Tensor&,
+    at::Tensor&,
+    bool);
 at::Tensor mixtral_moe_woq(
     const at::Tensor&,
     const at::Tensor&,
@@ -40,6 +70,30 @@ at::Tensor mixtral_moe(
     const at::Tensor&,
     at::Tensor&,
     bool);
+at::Tensor deepseek_moe(
+    const at::Tensor&,
+    const at::Tensor&,
+    const std::vector<at::Tensor>&,
+    const std::vector<c10::intrusive_ptr<LinearOpContext>>&,
+    const std::vector<at::Tensor>&,
+    const std::vector<c10::intrusive_ptr<LinearOpContext>>&,
+    const std::vector<at::Tensor>&,
+    const std::vector<c10::intrusive_ptr<LinearOpContext>>&,
+    const at::Tensor&,
+    at::Tensor&,
+    bool);
+at::Tensor deepseek_moe_mkl(
+    const at::Tensor&,
+    const at::Tensor&,
+    const std::vector<at::Tensor>&,
+    const std::vector<c10::intrusive_ptr<MKLOpContext>>&,
+    const std::vector<at::Tensor>&,
+    const std::vector<c10::intrusive_ptr<MKLOpContext>>&,
+    const std::vector<at::Tensor>&,
+    const std::vector<c10::intrusive_ptr<MKLOpContext>>&,
+    const at::Tensor&,
+    at::Tensor&,
+    bool);
 using mixtral_moe_tpp_kernel_fn = at::Tensor (*)(
     const at::Tensor& hidden_states,
     const at::Tensor& top_x,
@@ -51,6 +105,16 @@ using mixtral_moe_tpp_kernel_fn = at::Tensor (*)(
     const at::Tensor& routing_weights,
     at::Tensor& output,
     bool is_distributed);
+using deepseek_moe_tpp_kernel_fn = at::Tensor (*)(
+    const at::Tensor& hidden_states,
+    const at::Tensor& expert_mask,
+    const std::vector<at::Tensor>& gate_wei,
+    const std::vector<at::Tensor>& up_wei,
+    const std::vector<at::Tensor>& down_wei,
+    bool tpp_fallback,
+    const at::Tensor& routing_weights,
+    at::Tensor& output,
+    bool is_distributed);
 using mixtral_moe_woq_kernel_fn = at::Tensor (*)(
     const at::Tensor& hidden_states,
     const at::Tensor& top_x,
@@ -61,6 +125,15 @@ using mixtral_moe_woq_kernel_fn = at::Tensor (*)(
     const at::Tensor& routing_weights,
     at::Tensor& output,
     bool is_distributed);
+using deepseek_moe_woq_kernel_fn = at::Tensor (*)(
+    const at::Tensor& hidden_states,
+    const at::Tensor& expert_mask,
+    const std::vector<c10::intrusive_ptr<WoqLinearOpContext>>& gate_ctx,
+    const std::vector<c10::intrusive_ptr<WoqLinearOpContext>>& up_ctx,
+    const std::vector<c10::intrusive_ptr<WoqLinearOpContext>>& down_ctx,
+    const at::Tensor& routing_weights,
+    at::Tensor& output,
+    bool is_distributed);
 using mixtral_moe_kernel_fn = at::Tensor (*)(
     const at::Tensor& hidden_states,
     const at::Tensor& top_x,
@@ -75,8 +148,36 @@ using mixtral_moe_kernel_fn = at::Tensor (*)(
     const at::Tensor& routing_weights,
     at::Tensor& output,
     bool is_distributed);
+using deepseek_moe_kernel_fn = at::Tensor (*)(
+    const at::Tensor& hidden_states,
+    const at::Tensor& expert_mask,
+    const std::vector<at::Tensor>& gate_wei,
+    const std::vector<c10::intrusive_ptr<LinearOpContext>>& gate_op_ctx,
+    const std::vector<at::Tensor>& up_wei,
+    const std::vector<c10::intrusive_ptr<LinearOpContext>>& up_op_ctx,
+    const std::vector<at::Tensor>& down_wei,
+    const std::vector<c10::intrusive_ptr<LinearOpContext>>& down_op_ctx,
+    const at::Tensor& routing_weights,
+    at::Tensor& output,
+    bool is_distributed);
+using deepseek_moe_mkl_kernel_fn = at::Tensor (*)(
+    const at::Tensor& hidden_states,
+    const at::Tensor& expert_mask,
+    const std::vector<at::Tensor>& gate_wei,
+    const std::vector<c10::intrusive_ptr<MKLOpContext>>& gate_op_ctx,
+    const std::vector<at::Tensor>& up_wei,
+    const std::vector<c10::intrusive_ptr<MKLOpContext>>& up_op_ctx,
+    const std::vector<at::Tensor>& down_wei,
+    const std::vector<c10::intrusive_ptr<MKLOpContext>>& down_op_ctx,
+    const at::Tensor& routing_weights,
+    at::Tensor& output,
+    bool is_distributed);
 IPEX_DECLARE_DISPATCH(mixtral_moe_tpp_kernel_fn, mixtral_moe_tpp_kernel_stub);
+IPEX_DECLARE_DISPATCH(deepseek_moe_tpp_kernel_fn, deepseek_moe_tpp_kernel_stub);
 IPEX_DECLARE_DISPATCH(mixtral_moe_woq_kernel_fn, mixtral_moe_woq_kernel_stub);
+IPEX_DECLARE_DISPATCH(deepseek_moe_woq_kernel_fn, deepseek_moe_woq_kernel_stub);
 IPEX_DECLARE_DISPATCH(mixtral_moe_kernel_fn, mixtral_moe_kernel_stub);
+IPEX_DECLARE_DISPATCH(deepseek_moe_kernel_fn, deepseek_moe_kernel_stub);
+IPEX_DECLARE_DISPATCH(deepseek_moe_mkl_kernel_fn, deepseek_moe_mkl_kernel_stub);
 } // namespace cpu
 } // namespace torch_ipex
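The header mirrors the definitions in MoE.cpp: for each deepseek_moe_* entry point it declares the public function, a *_kernel_fn pointer type, and an IPEX_DECLARE_DISPATCH stub that pairs with the corresponding IPEX_DEFINE_DISPATCH so ISA-specific kernel implementations can register behind a single dispatch point. (Note the hunk also introduces a second declaration of mixtral_moe_woq just above the existing one; redeclaration is legal C++ and harmless.)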

examples/cpu/llm/inference/distributed/run_accuracy_with_deepspeed.py

Lines changed: 4 additions & 2 deletions
@@ -400,12 +400,14 @@ def get_repo_root(model_name_or_path):
 
 def get_checkpoint_files(model_name_or_path):
     cached_repo_dir = get_repo_root(model_name_or_path)
-
+    glob_pattern = "*.[bp][it][n]"
+    if re.search("deepseek-v2", model_name_or_path, re.IGNORECASE):
+        glob_pattern = "*.[sbp][ait][fn][e][t][e][n][s][o][r][s]"
     # extensions: .bin | .pt
     # creates a list of paths from all downloaded files in cache dir
     file_list = [
         str(entry)
-        for entry in Path(cached_repo_dir).rglob("*.[bp][it][n]")
+        for entry in Path(cached_repo_dir).rglob(glob_pattern)
         if entry.is_file()
     ]
     return file_list
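The widened glob is a character-class template rather than a true alternation: each [...] fixes the set of characters allowed at one position, so the deepseek-v2 branch effectively selects .safetensors shards (glob classes cannot express ".bin or .pt or .safetensors" exactly, hence the two separate patterns). An illustrative check, not part of the commit:

import fnmatch

names = ["model-00001-of-00002.safetensors", "pytorch_model.bin"]
# default pattern: three class positions after the dot, e.g. "bin"
print(fnmatch.filter(names, "*.[bp][it][n]"))
# -> ['pytorch_model.bin']
# deepseek-v2 pattern: eleven class positions, satisfied by "safetensors"
print(fnmatch.filter(names, "*.[sbp][ait][fn][e][t][e][n][s][o][r][s]"))
# -> ['model-00001-of-00002.safetensors']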
