Commit c8a0706
Qualcomm AI Engine Direct - GA Static Phi-4-mini (#13179)
### Summary

- Support Phi-4-mini-instruct for the static llama path
- Add P-ROPE (partial rotary position embedding) for Phi-4-mini
- Add the EOS token for Phi-4-mini

### Test plan

```
python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s $DEVICE -m SM8750 --prompt "I would like to learn python, could you teach me with a simple example?" --temperature 0 --model_mode hybrid --prefill_ar_len 32 --max_seq_len 128 --ptq 16a8w --decoder_model phi_4_mini --num_sharding 4
```

cc: @haowhsu-quic, @shewu-quic, @winskuo-quic, @cccclai
1 parent 6d56713 commit c8a0706

File tree

- examples/qualcomm/oss_scripts/llama/README.md
- examples/qualcomm/oss_scripts/llama/__init__.py
- examples/qualcomm/oss_scripts/llama/decoder_constants.py
- examples/qualcomm/oss_scripts/llama/llama.py
- examples/qualcomm/oss_scripts/llama/model/static_llama.py
- examples/qualcomm/oss_scripts/llama/qnn_llama_runner.cpp
- examples/qualcomm/oss_scripts/llama/runner/runner.cpp
- examples/qualcomm/oss_scripts/llama/runner/runner.h

8 files changed: +73 −12 lines changed

examples/qualcomm/oss_scripts/llama/README.md

Lines changed: 2 additions & 0 deletions
```diff
@@ -6,6 +6,8 @@ This file provides you the instructions to run LLM Decoder model with different
 2. LLAMA3.2 1B
 3. LLAMA3.2 3B
 4. QWEN2.5 0.5B
+5. QWEN3 0.6B / 1.7B
+6. Phi4-mini-instruct
 
 We offer the following modes to execute the model:
 
```

examples/qualcomm/oss_scripts/llama/__init__.py

Lines changed: 14 additions & 0 deletions
```diff
@@ -9,6 +9,9 @@
 from dataclasses import dataclass, field
 from typing import Callable, Dict, Type
 
+from executorch.examples.models.phi_4_mini import (
+    convert_weights as convert_phi_4_mini_weights,
+)
 from executorch.examples.models.qwen2_5 import (
     convert_weights as convert_qwen2_5_weights,
 )
@@ -71,3 +74,14 @@ class Qwen3_1_7B(HFModel):
     )
     runner_version: str = field(default=DECODER_MODEL_VERSION["qwen2_5"])
     convert_weights = convert_qwen3_weights
+
+
+@register_hf_model("phi_4_mini")
+@dataclass(init=False, frozen=True)
+class Phi4Mini(HFModel):
+    repo_id: str = "microsoft/Phi-4-mini-instruct"
+    params_path: str = os.path.join(
+        BASE_DIR, "../../../models/phi_4_mini/config/config.json"
+    )
+    runner_version: str = field(default=DECODER_MODEL_VERSION["phi_4_mini"])
+    convert_weights = convert_phi_4_mini_weights
```
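The decorator registers the new entry under the `phi_4_mini` key, which is how the export script later resolves the Hugging Face repo (see the `SUPPORTED_HF_MODELS` lookup in the llama.py diff below). A minimal sketch of that lookup, assuming `SUPPORTED_HF_MODELS` is the registry populated by `@register_hf_model`; the import path is an assumption, not taken from the diff:

```python
# Hypothetical usage sketch, not part of the diff.
# Assumption: SUPPORTED_HF_MODELS is the dict filled by @register_hf_model and
# is importable from this package (the exact import path may differ).
from executorch.examples.qualcomm.oss_scripts.llama import SUPPORTED_HF_MODELS

phi4 = SUPPORTED_HF_MODELS["phi_4_mini"]
print(phi4.repo_id)         # "microsoft/Phi-4-mini-instruct"
print(phi4.runner_version)  # DECODER_MODEL_VERSION["phi_4_mini"], i.e. "phi_4_mini"
```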

examples/qualcomm/oss_scripts/llama/decoder_constants.py

Lines changed: 1 addition & 0 deletions
```diff
@@ -15,4 +15,5 @@
     "stories110m": "llama2",
     "llama3_2": "llama3",
     "qwen2_5": "qwen2_5",
+    "phi_4_mini": "phi_4_mini",
 }
```

examples/qualcomm/oss_scripts/llama/llama.py

Lines changed: 8 additions & 3 deletions
```diff
@@ -579,7 +579,7 @@ def permute(w, heads):
             annotate_conv=args.ptq != "16a8w",
         ),
     )
-    if args.decoder_model == {"stories110m", "stories260k"}:
+    if args.decoder_model in {"stories110m", "stories260k"}:
         custom_annotations = custom_annotations + (
             annotate_linear_16a8w_in_affine_layer,
         )
@@ -1175,11 +1175,16 @@ def export_llama(args) -> None:
         tokenizer = AutoTokenizer.from_pretrained(model_id)
         runtime_tokenizer_path = tokenizer.save_pretrained(args.artifact)[-1]
         tokenizer = get_tokenizer(runtime_tokenizer_path)
+    elif args.decoder_model == "phi_4_mini":
+        model_id = SUPPORTED_HF_MODELS[args.decoder_model].repo_id
+        tokenizer = AutoTokenizer.from_pretrained(model_id)
+        runtime_tokenizer_path = tokenizer.save_pretrained(args.artifact)[-1]
+        tokenizer = get_tokenizer(runtime_tokenizer_path)
         with open(runtime_tokenizer_path, "r+") as file:
             data = json.load(file)
             # TODO: Encountered the following error during runtime, so switched behavior for now.
-            # Error: libc++abi: terminating due to uncaught exception of type std::runtime_error: Unsupported Normalizer type: NFC.
-            data.pop("normalizer")
+            # Error: libc++abi: terminating due to uncaught exception of type std::runtime_error: invert=true is not supported for Split PreTokenizer. Only invert=false is supported.
+            data["pre_tokenizer"]["pretokenizers"][-2]["invert"] = False
             file.seek(0)
             json.dump(data, file, indent=4)
             file.truncate()
```
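The first hunk above is a small but real bug fix: comparing a string to a set literal with `==` is always False, so the stories-model annotation branch could never run, while `in` performs the intended membership test. A quick illustration:

```python
decoder_model = "stories110m"

# Equality against a set literal never matches a string ...
print(decoder_model == {"stories110m", "stories260k"})  # False
# ... while membership does.
print(decoder_model in {"stories110m", "stories260k"})  # True
```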

examples/qualcomm/oss_scripts/llama/model/static_llama.py

Lines changed: 31 additions & 7 deletions
```diff
@@ -39,6 +39,24 @@ def apply_rotary_emb_single(
     return x_out
 
 
+def apply_partial_rotary_emb_single(
+    x: torch.Tensor, freqs_cos: torch.Tensor, freqs_sin: torch.Tensor
+) -> torch.Tensor:
+
+    if x.dim() == 4:
+        freqs_cos = freqs_cos[None, :, None, :]
+        freqs_sin = freqs_sin[None, :, None, :]
+
+    rotary_dim = freqs_cos.shape[-1] * 2
+
+    x_rot, x_pass = x[..., :rotary_dim], x[..., rotary_dim:]
+    x_r, x_i = x_rot[..., : x_rot.shape[-1] // 2], x_rot[..., x_rot.shape[-1] // 2 :]
+    x_out_r = x_r * freqs_cos - x_i * freqs_sin
+    x_out_i = x_r * freqs_sin + x_i * freqs_cos
+    x_rotated = torch.cat([x_out_r, x_out_i], dim=-1)
+    return torch.cat([x_rotated, x_pass], dim=-1)
+
+
 class LlamaAttention(nn.Module):
     def __init__(self, config: ModelArgs, output_new_cache_only=False):
         super().__init__()
@@ -60,6 +78,11 @@ def __init__(self, config: ModelArgs, output_new_cache_only=False):
             self.q_norm_fn = torch.nn.RMSNorm(q_norm_dim, eps=config.norm_eps)
             self.k_norm_fn = torch.nn.RMSNorm(k_norm_dim, eps=config.norm_eps)
 
+        if config.partial_rotary_factor < 1:
+            self.apply_rope_emb = apply_partial_rotary_emb_single
+        else:
+            self.apply_rope_emb = apply_rotary_emb_single
+
         self.wq = nn.Linear(
             self.dim,
             self.n_heads * self.head_dim,
@@ -199,17 +222,17 @@ def forward_sha( # noqa: C901
         for i in range(len(q)):
             if self.use_qk_norm and self.qk_norm_before_rope:
                 q[i] = self.q_norm_fn(q[i])
-            q[i] = apply_rotary_emb_single(q[i], freqs_cos, freqs_sin)
+            q[i] = self.apply_rope_emb(q[i], freqs_cos, freqs_sin)
             if hasattr(self.config, "enable_r3") and self.config.enable_r3:
-                q[i] = torch.matmul(q[i], self.r3_weight.T)
+                q[i] = torch.matmul(q[i], self.r3_weight)
             if self.use_qk_norm and not self.qk_norm_before_rope:
                 q[i] = self.q_norm_fn(q[i])
         for i in range(len(k)):
             if self.use_qk_norm and self.qk_norm_before_rope:
                 k[i] = self.k_norm_fn(k[i])
-            k[i] = apply_rotary_emb_single(k[i], freqs_cos, freqs_sin).transpose(1, 2)
+            k[i] = self.apply_rope_emb(k[i], freqs_cos, freqs_sin).transpose(1, 2)
             if hasattr(self.config, "enable_r3") and self.config.enable_r3:
-                k[i] = torch.matmul(k[i], self.r3_weight.T)
+                k[i] = torch.matmul(k[i], self.r3_weight)
             if self.use_qk_norm and not self.qk_norm_before_rope:
                 k[i] = self.k_norm_fn(k[i])
 
@@ -272,8 +295,8 @@ def forward(
             q = self.q_norm_fn(q)
             k = self.k_norm_fn(k)
 
-        q = apply_rotary_emb_single(q, freqs_cos, freqs_sin)
-        k = apply_rotary_emb_single(k, freqs_cos, freqs_sin).permute(0, 2, 3, 1)
+        q = self.apply_rope_emb(q, freqs_cos, freqs_sin)
+        k = self.apply_rope_emb(k, freqs_cos, freqs_sin).permute(0, 2, 3, 1)
 
         if self.use_qk_norm and not self.qk_norm_before_rope:
             q = self.q_norm_fn(q)
@@ -368,7 +391,8 @@ def __init__(self, config: ModelArgs, output_new_cache_only=False):
         super().__init__()
         self.dim = config.dim
         self.attention = LlamaAttention(
-            config=config, output_new_cache_only=output_new_cache_only
+            config=config,
+            output_new_cache_only=output_new_cache_only,
         )
         self.feed_forward = FeedForward(config)
         self.attention_norm = torch.nn.RMSNorm(config.dim, eps=config.norm_eps)
```
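`apply_partial_rotary_emb_single` rotates only the leading `rotary_dim` channels of each head (with `rotary_dim` derived from the precomputed frequency tables, i.e. from the model's `partial_rotary_factor`) and passes the remaining channels through unchanged, which is the P-ROPE behavior Phi-4-mini needs. A small sanity-check sketch, assuming the import path mirrors the file location shown above; the shapes are toy values chosen for illustration, with the sequence axis at dim 1 as the `freqs` indexing above implies:

```python
import torch

# Assumed import path, mirroring examples/qualcomm/oss_scripts/llama/model/static_llama.py.
from executorch.examples.qualcomm.oss_scripts.llama.model.static_llama import (
    apply_partial_rotary_emb_single,
)

seq_len, n_heads, head_dim = 4, 2, 8
rotary_dim = 6  # toy example of partial_rotary_factor = 0.75: 6 of 8 channels rotate

x = torch.randn(1, seq_len, n_heads, head_dim)     # [batch, seq, heads, head_dim]
freqs_cos = torch.randn(seq_len, rotary_dim // 2)  # frequency tables only cover
freqs_sin = torch.randn(seq_len, rotary_dim // 2)  # the rotated half-dimensions

out = apply_partial_rotary_emb_single(x, freqs_cos, freqs_sin)

assert out.shape == x.shape
# The trailing head_dim - rotary_dim channels are passed through untouched.
assert torch.equal(out[..., rotary_dim:], x[..., rotary_dim:])
```

In the attention module, `self.apply_rope_emb` is bound to this partial variant only when `config.partial_rotary_factor < 1`, so existing full-RoPE models keep their current path.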

examples/qualcomm/oss_scripts/llama/qnn_llama_runner.cpp

Lines changed: 12 additions & 2 deletions
```diff
@@ -9,8 +9,8 @@
 /**
  * @file
  *
- * This tool can run Llama2 110M, Llama3.2 1B / 3B, Qwen2.5 0.5B with Qualcomm
- * AI Engine Direct.
+ * This tool can run Llama2 110M, Llama3.2 1B / 3B, Qwen2.5 0.5B, Qwen3 0.6B
+ * / 1.7B phi4-mini-instruct with Qualcomm AI Engine Direct.
  *
  */
 
@@ -104,6 +104,16 @@ std::string get_formatted_prompt(
     case example::DecoderModelVersion::kQwen2_5:
       formatted_prompt.append(prompt);
       break;
+    case example::DecoderModelVersion::kPhi4:
+      if (!system_prompt.empty()) {
+        formatted_prompt.append("<|system|>");
+        formatted_prompt.append(system_prompt);
+        formatted_prompt.append("<|end|>");
+      }
+      formatted_prompt.append("<|user|>");
+      formatted_prompt.append(prompt);
+      formatted_prompt.append("<|end|><|assistant|>");
+      break;
     case example::DecoderModelVersion::kLlama3:
       if (!system_prompt.empty()) {
         formatted_prompt.append(
```
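The new `kPhi4` case wraps the prompt in the Phi-4 chat tags shown in the diff. A rough Python sketch of the same string construction (the helper name is mine, not part of the runner):

```python
def format_phi4_prompt(prompt: str, system_prompt: str = "") -> str:
    # Mirrors the kPhi4 case in get_formatted_prompt() above.
    formatted = ""
    if system_prompt:
        formatted += "<|system|>" + system_prompt + "<|end|>"
    formatted += "<|user|>" + prompt + "<|end|><|assistant|>"
    return formatted


print(format_phi4_prompt("What is 2 + 2?", "You are a helpful assistant."))
# <|system|>You are a helpful assistant.<|end|><|user|>What is 2 + 2?<|end|><|assistant|>
```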

examples/qualcomm/oss_scripts/llama/runner/runner.cpp

Lines changed: 4 additions & 0 deletions
```diff
@@ -130,6 +130,8 @@ Runner::Runner(
     decoder_model_version_ = DecoderModelVersion::kLlama3;
   } else if (decoder_model_version == "qwen2_5") {
     decoder_model_version_ = DecoderModelVersion::kQwen2_5;
+  } else if (decoder_model_version == "phi_4_mini") {
+    decoder_model_version_ = DecoderModelVersion::kPhi4;
   } else {
     ET_CHECK_MSG(false, "Unsupported Decoder Model");
   }
@@ -185,6 +187,8 @@ Error Runner::load() {
   }
   if (decoder_model_version_ == DecoderModelVersion::kLlama3) {
     eos_ids->insert(tokenizer_->encode("<|eot_id|>", 0, 0).get()[0]);
+  } else if (decoder_model_version_ == DecoderModelVersion::kPhi4) {
+    eos_ids->insert(tokenizer_->encode("<|end|>", 0, 0).get()[0]);
   }
   // Try avoid getMetadataHelper as it is time consuming.
   Result<MethodMeta> method_meta =
```
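Together with the Llama 3 branch above it, each decoder version now adds its own chat-template end token to the EOS set, so decoding stops as soon as the model emits it. Summarized as a sketch (the dict name is mine; the values come from the diff):

```python
# Extra stop strings the runner encodes and inserts into eos_ids, per decoder version.
EXTRA_EOS_TOKEN = {
    "llama3": "<|eot_id|>",
    "phi_4_mini": "<|end|>",  # added in this commit; matches the <|end|> tag in the prompt format
}
```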

examples/qualcomm/oss_scripts/llama/runner/runner.h

Lines changed: 1 addition & 0 deletions
```diff
@@ -31,6 +31,7 @@ enum DecoderModelVersion {
   kLlama2 = 0,
   kLlama3,
   kQwen2_5,
+  kPhi4,
 };
 class Runner {
  public:
```
