Update permute removal pass to handle binary operations and clean up more thoroughly (#18256)

mcremon-meta · meta-codesync[bot] · commit 7fdb60e7655d · 2026-03-18T21:35:48.000-07:00
Summary: Pull Request resolved: #18256. As titled. The permute removal pass currently does not clean up as much as it should, and it can only handle single-input cases. Result: the number of permutes on Wake Gesture drops from 9 to 1 (the minimum by construction). Differential Revision: D96940254. Reviewed By: abeakkas
diff --git a/backends/cadence/aot/BUCK b/backends/cadence/aot/BUCK
@@ -300,6 +300,7 @@ fbcode_target(_kind = runtime.python_library,
     ],
     typing = True,
     deps = [
+        ":fuse_ops",
         ":ops_registrations",
         "//caffe2:torch",
         "//executorch/backends/cadence/aot:pass_utils",
diff --git a/backends/cadence/aot/fuse_ops.py b/backends/cadence/aot/fuse_ops.py
@@ -1170,7 +1170,10 @@ def can_fuse_for_chain(
             return False
 
         # checking that permut2(permut1(identity)) == identity, modulo unitary dimensions
-        input_shape = cast(torch.fx.Node, producer.args[0]).meta["val"].shape
+        producer_input = cast(torch.fx.Node, producer.args[0])
+        if "val" not in producer_input.meta:
+            return False
+        input_shape = producer_input.meta["val"].shape
         ident_dims = list(range(len(input_shape)))
         # this mapping helps to handle both transpose and permutations
         f: dict[Any, Callable] = {
diff --git a/backends/cadence/aot/passes.py b/backends/cadence/aot/passes.py
@@ -25,6 +25,7 @@
 from executorch.backends.cadence.aot.remove_ops import (
     CadenceRemoveNops,
     RemoveNopSliceOrViewOpPass,
+    RemovePermutesAroundElementwiseOps,
     RemoveRedundantOps,
 )
 from executorch.backends.cadence.aot.reorder_ops import CadenceReorderOpsInGraph
@@ -89,6 +90,7 @@ def get_passes_in_default_order() -> list[Type[ExportPass]]:
         CadenceSimplifyOpsInGraph.passes,
         FinalizePipeline,
         FuseFullThenReshapePass,
+        RemovePermutesAroundElementwiseOps,
         FuseTransposeOrPermuteOpPairsPass,
         RemoveNopSliceOrViewOpPass,
         CompileTimeTypeDispatchPass,
diff --git a/backends/cadence/aot/remove_ops.py b/backends/cadence/aot/remove_ops.py
@@ -14,14 +14,15 @@
 
 import torch
 import torch.fx
+
+from executorch.backends.cadence.aot.fuse_ops import FuseTransposeOrPermuteOpPairsPass
 from executorch.backends.cadence.aot.pass_utils import (
     CadencePassAttribute,
     get_arg,
     register_cadence_pass,
     RemoveOrReplacePassInterface,
     set_arg,
 )
-
 from executorch.backends.cadence.aot.simplify_ops import SimplifySliceOpPass
 from executorch.backends.cadence.aot.utils import get_edge_overload_packet
 from executorch.backends.transforms.remove_clone_ops import RemoveCloneOpsTransform
@@ -412,6 +413,9 @@ class Subgraph:
         exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default,
         exir_ops.edge.cadence.quantize_per_tensor.default,
         exir_ops.edge.cadence.dequantize_per_tensor.default,
+        exir_ops.edge.cadence.quantized_relu.per_tensor,
+        exir_ops.edge.cadence.requantize.per_tensor,
+        exir_ops.edge.cadence.quantized_add.per_tensor,
         # Ops that require special handling.
         exir_ops.edge.aten.cat.default,
         exir_ops.edge.aten.mean.dim,
@@ -804,6 +808,7 @@ class CommonRemovePasses:
         RemoveToOpsPass,
         RemoveZeroSizedCatArgsPass,
         RemovePermutesAroundElementwiseOps,
+        FuseTransposeOrPermuteOpPairsPass,
         RemoveSqueezeViewBeforeElementwiseOps,
         RemoveCatFromSliceCopyPass,
         RemoveCloneOpsTransformImported,