NXP backend: Add pre-processing pass to move view_copy nodes into their own QDQ clusters.

MartinPavella · MartinPavella · commit 5daaed47162b · 2025-08-07T15:34:41.000+02:00
A PyTorch model can contain a `Linear` operator with 4D IO. After quantization, it gets its own QDQ cluster. But after lowering to the edge dialect, `view_copy` operators are added within the cluster, right before and after the `Linear` (now `addmm`/`mm`). This does not follow the QDQ schema and causes issues later in the pipeline. Therefore, pre-processing passes at the edge dialect level were implemented to move the `view_copy` nodes into their own QDQ clusters.
diff --git a/backends/nxp/edge_passes/move_auxiliary_operator_into_separate_qdq_cluster_pass.py b/backends/nxp/edge_passes/move_auxiliary_operator_into_separate_qdq_cluster_pass.py
@@ -0,0 +1,219 @@
+# Copyright 2025 NXP
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+
+from executorch.backends.nxp.edge_passes.neutron_edge_pass import NeutronEdgePass
+from executorch.backends.nxp.neutron_partitioner import QDQClusterRecognizer
+from executorch.exir.dialects._ops import ops as exir_ops
+from torch.fx import Node
+from torch.fx.passes.infra.pass_base import PassResult
+
+
+def insert_qdq_pair_after_node(
+    graph: torch.fx.Graph, anchor: torch.fx.Node, q_params: tuple
+):
+    """Insert `quantize -> dequantize` after `anchor`, using `q_params`, and re-route all users of `anchor` to it."""
+    with graph.inserting_after(anchor):
+        quantize_op = graph.create_node(
+            op="call_function",
+            target=exir_ops.edge.quantized_decomposed.quantize_per_tensor.default,
+            args=(),  # Filled in at the end of this function.
+        )
+        # Shallow-copy the metadata, so the nodes do not all alias one shared `meta` dictionary.
+        quantize_op.meta = anchor.meta.copy()
+
+    with graph.inserting_after(quantize_op):
+        dequantize_op = graph.create_node(
+            op="call_function",
+            target=exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default,
+            args=(quantize_op,) + q_params,
+        )
+        dequantize_op.meta = anchor.meta.copy()
+    anchor.replace_all_uses_with(dequantize_op)
+
+    # Connect the `quantize_op` to the `anchor` only now, so that the `replace_all_uses_with()` above could not
+    #  redirect this new use of the `anchor` to the `dequantize_op`.
+    quantize_op.args = (anchor,) + q_params
+
+
+def _is_dequantize(node_: Node) -> bool:
+    """Return True if `node_` is a `call_function` node targeting the edge `dequantize_per_tensor` operator."""
+    if node_.op != "call_function":
+        return False
+    dq_target = exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default
+    return node_.target == dq_target
+
+
+def _is_quantize(node_: Node) -> bool:
+    """Return True if `node_` is a `call_function` node targeting the edge `quantize_per_tensor` operator."""
+    if node_.op != "call_function":
+        return False
+    q_target = exir_ops.edge.quantized_decomposed.quantize_per_tensor.default
+    return node_.target == q_target
+
+
+class MoveLeadingAuxiliaryOperatorIntoSeparateQDQClusterPass(NeutronEdgePass):
+    """Move an `<aux_node>` which leads a QDQ cluster into its own cluster, by adding a Q/DQ pair right after it.
+                                                           │
+                                                     ┌─────▼──────┐
+                │                                    │ dequantize │
+          ┌─────▼──────┐                             └─────┬──────┘
+          │ dequantize │                             ┌─────▼──────┐
+          └─────┬──────┘                             │ <aux_node> │
+          ┌─────▼──────┐                             └─────┬──────┘
+          │ <aux_node> │                              ┌────▼─────┐            ┐
+          └─────┬──────┘                              │ quantize │            │
+     ┌──────────▼──────────┐       replaced with      └────┬─────┘            │
+    ⋯┤ <main_cluster_node> ├⋯     ──────────────►          │                  │ newly added nodes
+     └──────────┬──────────┘                         ┌─────▼──────┐           │
+                ▼                                    │ dequantize │           │
+                ⋮                                    └─────┬──────┘           ┘
+           ┌────▼─────┐                         ┌──────────▼──────────┐
+           │ quantize │                        ⋯┤ <main_cluster_node> ├⋯
+           └────┬─────┘                         └──────────┬──────────┘
+                ▼                                          ▼
+                                                           ⋮
+                                                      ┌────▼─────┐
+                                                      │ quantize │
+                                                      └────┬─────┘
+                                                           ▼
+    """
+
+    allowed_auxiliary_nodes = [exir_ops.edge.aten.view_copy.default]
+
+    # The `<aux_node>` is only moved if its single user is one of these operators.
+    allowed_main_cluster_nodes = [
+        exir_ops.edge.aten.addmm.default,
+        exir_ops.edge.aten.mm.default,
+    ]
+
+    def run(self, graph_module: torch.fx.GraphModule) -> PassResult:
+        """Give the first matching leading `<aux_node>` its own QDQ cluster, and report whether the graph changed."""
+        graph = graph_module.graph
+        for candidate in graph.nodes:
+            is_auxiliary = (
+                candidate.op == "call_function"
+                and candidate.target in self.allowed_auxiliary_nodes
+            )
+            if not is_auxiliary:
+                continue
+
+            # The auxiliary node must consume the output of a `dequantize` node.
+            producer = candidate.args[0]
+            if not _is_dequantize(producer):
+                continue
+
+            # The auxiliary node must feed exactly 1 node, and that node must be an approved operator.
+            if len(candidate.users) != 1:
+                continue
+
+            (consumer,) = candidate.users
+            is_main = (
+                consumer.op == "call_function"
+                and consumer.target in self.allowed_main_cluster_nodes
+            )
+            if not is_main:
+                continue
+
+            # All 3 nodes have to belong to the QDQ cluster of the approved operator.
+            cluster = QDQClusterRecognizer().get_qdq_cluster(consumer)
+            if not all(
+                member in cluster for member in (producer, candidate, consumer)
+            ):
+                continue
+
+            # ---- Pattern from the class docstring matched. Re-use the parameters of the `dequantize` node. ----
+            q_params = producer.args[1:]
+            insert_qdq_pair_after_node(graph, candidate, q_params)
+
+            # Stop here, as the graph was modified while being iterated. The parent class is expected to call
+            #  `run()` again.
+            return PassResult(graph_module, True)
+
+        # No node matched the pattern, so the graph is unchanged.
+        return PassResult(graph_module, False)
+
+
+class MoveTrailingAuxiliaryOperatorIntoSeparateQDQClusterPass(NeutronEdgePass):
+    """Move an `<aux_node>` which trails a QDQ cluster into its own cluster, by adding a Q/DQ pair right before it.
+                                                            │
+                                                      ┌─────▼──────┐
+                │                                     │ dequantize │
+          ┌─────▼──────┐                              └─────┬──────┘
+          │ dequantize │                                    ⋮
+          └─────┬──────┘                         ┌──────────▼──────────┐
+                ▼                               ⋯┤ <main_cluster_node> ├⋯
+                ⋮                                └──────────┬──────────┘
+     ┌──────────▼──────────┐       replaced with       ┌────▼─────┐            ┐
+    ⋯┤ <main_cluster_node> ├⋯     ──────────────►      │ quantize │            │
+     └──────────┬──────────┘                           └────┬─────┘            │
+          ┌─────▼──────┐                                    │                  │ newly added nodes
+          │ <aux_node> │                              ┌─────▼──────┐           │
+          └─────┬──────┘                              │ dequantize │           │
+           ┌────▼─────┐                               └─────┬──────┘           ┘
+           │ quantize │                               ┌─────▼──────┐
+           └────┬─────┘                               │ <aux_node> │
+                ▼                                     └─────┬──────┘
+                                                       ┌────▼─────┐
+                                                       │ quantize │
+                                                       └────┬─────┘
+                                                            ▼
+    """
+
+    allowed_auxiliary_nodes = [exir_ops.edge.aten.view_copy.default]
+
+    # The `<aux_node>` is only moved if its input is produced by one of these operators.
+    allowed_main_cluster_nodes = [
+        exir_ops.edge.aten.addmm.default,
+        exir_ops.edge.aten.mm.default,
+    ]
+
+    def run(self, graph_module: torch.fx.GraphModule) -> PassResult:
+        """Give the first matching trailing `<aux_node>` its own QDQ cluster, and report whether the graph changed."""
+        graph = graph_module.graph
+        for candidate in graph.nodes:
+            is_auxiliary = (
+                candidate.op == "call_function"
+                and candidate.target in self.allowed_auxiliary_nodes
+            )
+            if not is_auxiliary:
+                continue
+
+            # The auxiliary node must consume the output of an approved operator.
+            producer = candidate.args[0]
+            is_main = (
+                producer.op == "call_function"
+                and producer.target in self.allowed_main_cluster_nodes
+            )
+            if not is_main:
+                continue
+
+            # The auxiliary node must feed exactly 1 node, and that node must be a `quantize`.
+            if len(candidate.users) != 1:
+                continue
+
+            (consumer,) = candidate.users
+            if not _is_quantize(consumer):
+                continue
+
+            # All 3 nodes have to belong to the QDQ cluster of the approved operator.
+            cluster = QDQClusterRecognizer().get_qdq_cluster(producer)
+            if not all(
+                member in cluster for member in (consumer, candidate, producer)
+            ):
+                continue
+
+            # ---- Pattern from the class docstring matched. Re-use the parameters of the `quantize` node. ----
+            q_params = consumer.args[1:]
+            insert_qdq_pair_after_node(graph, producer, q_params)
+
+            # Stop here, as the graph was modified while being iterated. The parent class is expected to call
+            #  `run()` again.
+            return PassResult(graph_module, True)
+
+        # No node matched the pattern, so the graph is unchanged.
+        return PassResult(graph_module, False)
diff --git a/backends/nxp/edge_passes/neutron_edge_pass.py b/backends/nxp/edge_passes/neutron_edge_pass.py
@@ -7,17 +7,17 @@
 from abc import abstractmethod
 
 import torch
-from torch.fx.passes.infra.pass_base import PassResult
 
 from executorch.exir.pass_base import ExportPass
+from torch.fx.passes.infra.pass_base import PassResult
 
 
 class NeutronEdgePass(ExportPass):
-    """ Abstract parent class for pre-processing passes on the edge dialect level. """
+    """Abstract parent class for pre-processing passes on the edge dialect level."""
 
     def call(self, graph_module: torch.fx.GraphModule) -> PassResult:
-        """ Call `self.run()` as long as changes are being made. After a pass modifies the graph, it cannot keep on
-             iterating through its nodes, and must return. This method allows the pass to go through the whole model.
+        """Call `self.run()` as long as changes are being made. After a pass modifies the graph, it cannot keep on
+        iterating through its nodes, and must return. This method allows the pass to go through the whole model.
         """
 
         # Every pass will return once it makes a change to the graph, to avoid traversing and modifying a graph at the
@@ -36,19 +36,20 @@ def call(self, graph_module: torch.fx.GraphModule) -> PassResult:
                 return PassResult(graph_module, modified)
 
         # Iteration limit was reached.
-        logging.warning(f'The NeutronEdgePass `{self.__class__.__name__}` reached the iteration limit.')
+        logging.warning(
+            f"The NeutronEdgePass `{self.__class__.__name__}` reached the iteration limit."
+        )
         graph_module = self.recompile_module(graph_module)
         return PassResult(graph_module, modified)
 
     @abstractmethod
     def run(self, graph_module: torch.fx.GraphModule) -> PassResult:
-        """ Child classes should implement their graph modification here. """
+        """Child classes should implement their graph modification here."""
         pass
 
     def recompile_module(
         self, graph_module: torch.fx.GraphModule
     ) -> torch.fx.GraphModule:
-        """ Recompile the graph and re-trace the metadata. This should ensure that the datatypes and shapes are correct.
-        """
+        """Recompile the graph and re-trace the metadata. This should ensure that the datatypes and shapes are correct."""
         graph_module.recompile()
         return super().call(graph_module).graph_module
diff --git a/backends/nxp/edge_passes/neutron_edge_pass_manager.py b/backends/nxp/edge_passes/neutron_edge_pass_manager.py
@@ -5,33 +5,38 @@
 
 import copy
 
-from torch import nn
-from torch.export import ExportedProgram
-from torch.fx.passes.infra.pass_base import PassResult
-from torch.fx.passes.infra.pass_manager import PassManager
-
 from executorch.backends.nxp.edge_passes.move_auxiliary_operator_into_separate_qdq_cluster_pass import (
     MoveLeadingAuxiliaryOperatorIntoSeparateQDQClusterPass,
     MoveTrailingAuxiliaryOperatorIntoSeparateQDQClusterPass,
 )
 from executorch.backends.nxp.edge_passes.neutron_edge_pass import NeutronEdgePass
 from executorch.exir import EdgeProgramManager
-from executorch.exir.program._program import _get_updated_graph_signature, _get_updated_range_constraints
+from executorch.exir.program._program import (
+    _get_updated_graph_signature,
+    _get_updated_range_constraints,
+)
+
+from torch import nn
+from torch.export import ExportedProgram
+from torch.fx.passes.infra.pass_base import PassResult
+from torch.fx.passes.infra.pass_manager import PassManager
 
 
 class NeutronEdgePassManager(PassManager):
 
     def __init__(self, passes: list[NeutronEdgePass] = None):
         passes: list[NeutronEdgePass] = passes or [
+            MoveLeadingAuxiliaryOperatorIntoSeparateQDQClusterPass(),
+            MoveTrailingAuxiliaryOperatorIntoSeparateQDQClusterPass(),
         ]
 
         super().__init__(
             passes,
-            steps=10  # Empirical value. At most 10 cycles of passes will be run.
+            steps=10,  # Empirical value. At most 10 cycles of passes will be run.
         )
 
     def _transform_graph_module(self, module: nn.Module) -> PassResult:
-        """ Apply the passes to a single graph module. """
+        """Apply the passes to a single graph module."""
         pass_result: PassResult = super().__call__(module)
 
         graph_module = pass_result.graph_module
@@ -41,7 +46,7 @@ def _transform_graph_module(self, module: nn.Module) -> PassResult:
         return pass_result
 
     def __call__(self, epm: EdgeProgramManager) -> EdgeProgramManager:
-        """ Apply the passes to all graph modules in the edge program. """
+        """Apply the passes to all graph modules in the edge program."""
         new_programs: dict[str, ExportedProgram] = {}
 
         for name, program in epm._edge_programs.items():
@@ -56,7 +61,9 @@ def __call__(self, epm: EdgeProgramManager) -> EdgeProgramManager:
                         program.graph_signature, pass_result.graph_module
                     ),
                     state_dict=program.state_dict,
-                    range_constraints=_get_updated_range_constraints(pass_result.graph_module),
+                    range_constraints=_get_updated_range_constraints(
+                        pass_result.graph_module
+                    ),
                     module_call_graph=copy.deepcopy(program._module_call_graph),
                     example_inputs=program.example_inputs,
                     constants=program.constants,
@@ -77,4 +84,6 @@ def __call__(self, epm: EdgeProgramManager) -> EdgeProgramManager:
 
         else:
             # Return a new EdgeProgramManager with the updated programs.
-            return EdgeProgramManager(new_programs, copy.deepcopy(epm._config_methods), epm.compile_config)
+            return EdgeProgramManager(
+                new_programs, copy.deepcopy(epm._config_methods), epm.compile_config
+            )
diff --git a/backends/nxp/tests/executorch_pipeline.py b/backends/nxp/tests/executorch_pipeline.py
@@ -4,11 +4,11 @@
 # LICENSE file in the root directory of this source tree.
 
 import torch
-from torch import nn
-from torchao.quantization.pt2e.quantize_pt2e import convert_pt2e, prepare_pt2e
 
 from executorch import exir
-from executorch.backends.nxp.edge_passes.neutron_edge_pass_manager import NeutronEdgePassManager
+from executorch.backends.nxp.edge_passes.neutron_edge_pass_manager import (
+    NeutronEdgePassManager,
+)
 from executorch.backends.nxp.neutron_partitioner import NeutronPartitioner
 from executorch.backends.nxp.nxp_backend import generate_neutron_compile_spec
 from executorch.backends.nxp.quantizer.neutron_quantizer import NeutronQuantizer
@@ -19,6 +19,8 @@
     ExecutorchProgramManager,
 )
 from executorch.extension.export_util.utils import export_to_edge
+from torch import nn
+from torchao.quantization.pt2e.quantize_pt2e import convert_pt2e, prepare_pt2e
 
 
 def _quantize_model(model, calibration_inputs: list[tuple[torch.Tensor]]):
diff --git a/backends/nxp/tests/models.py b/backends/nxp/tests/models.py
@@ -125,6 +125,24 @@ def forward(self, x):
         return x
 
 
+class ConvFCFCSoftmaxModuleWithoutReshape(torch.nn.Module):
+    """Conv2d -> Linear -> Linear -> Softmax(dim=1), with no reshape between the convolution and the `Linear` layers."""
+
+    def __init__(self):
+        super().__init__()
+        self.conv = torch.nn.Conv2d(in_channels=4, out_channels=5, kernel_size=2, bias=False)
+        self.fc1 = torch.nn.Linear(in_features=32, out_features=16)
+        self.fc2 = torch.nn.Linear(in_features=16, out_features=8)
+        self.softmax = torch.nn.Softmax(dim=1)
+
+    def forward(self, x):
+        # The `Linear` layers act on the last dimension of the conv output directly, which therefore must be of
+        #  size 32 (the `in_features` of `fc1`).
+        features = self.conv(x)
+        hidden = self.fc2(self.fc1(features))
+        return self.softmax(hidden)
+
+
 class ConstantPadNDModule(torch.nn.Module):
     def __init__(self, paddings: Collection[int], constant: float | int | None = None):
         super().__init__()
diff --git a/backends/nxp/tests/test_batch_norm_fusion.py b/backends/nxp/tests/test_batch_norm_fusion.py
diff --git a/backends/nxp/tests/test_edge_passes.py b/backends/nxp/tests/test_edge_passes.py