diff --git a/backends/arm/test/misc/BUCK b/backends/arm/test/misc/BUCK index 9284a0a72b0..fe2d97f03ce 100644 --- a/backends/arm/test/misc/BUCK +++ b/backends/arm/test/misc/BUCK @@ -15,3 +15,15 @@ fbcode_target(_kind = runtime.python_library, "//executorch/backends/arm/test:arm_tester_lib", ], ) + +fbcode_target(_kind = runtime.python_binary, + name = "test_conv_to_fc_cycles", + srcs = ["test_conv_to_fc_cycles.py"], + main_function = "executorch.backends.arm.test.misc.test_conv_to_fc_cycles.main", + deps = [ + "//executorch/backends/arm:ethosu", + "//executorch/backends/arm:_factory", + "//executorch/backends/arm/quantizer:lib", + "//executorch/exir:lib", + ], +) diff --git a/backends/arm/test/misc/test_conv_to_fc_cycles.py b/backends/arm/test/misc/test_conv_to_fc_cycles.py new file mode 100644 index 00000000000..c29aa357ecf --- /dev/null +++ b/backends/arm/test/misc/test_conv_to_fc_cycles.py @@ -0,0 +1,109 @@ +#!/usr/bin/env python3 +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +"""Verify cycle improvement from convert_conv_to_fc optimization (D96932767). + +Compiles nn.Linear layers through the ExecuTorch ARM backend to TOSA, +then runs Vela/Regor with --verbose-performance to show per-layer cycle +counts. The optimization converts 1x1 Conv2D (produced by DecomposeLinearPass) +to FullyConnected, switching from ConvolutionMxN to VectorProduct NPU block. + +Usage (must be on a Linux devserver — Regor requires Linux): + + buck run fbcode//executorch/backends/arm/test/misc:test_conv_to_fc_cycles + +To compare WITHOUT the optimization (baseline), temporarily comment out +`convert_conv_to_fc` from op_rewrite_list in tosa_graph_optimiser.py and +revert the graphir_optimiser.cpp changes, then re-run. + +Expected output with optimization: + NNG Operator = FullyConnected, Op Cycles ~ 1,341, Util% (MAC) ~ 5% + +Expected output WITHOUT optimization: + NNG Operator = Conv2D, Op Cycles ~ 9,858, Util% (MAC) ~ 0.67% +""" + +import tempfile + +import torch +import torch.nn as nn + +from executorch.backends.arm.ethosu import EthosUCompileSpec +from executorch.backends.arm.quantizer import ( + EthosUQuantizer, + get_symmetric_quantization_config, +) +from executorch.backends.arm.util._factory import create_partitioner +from executorch.exir import to_edge_transform_and_lower +from torch.ao.quantization.quantize_pt2e import convert_pt2e, prepare_pt2e + + +class SimpleLinear(nn.Module): + """Single nn.Linear layer — DecomposeLinearPass converts this to 1x1 Conv2D.""" + + def __init__(self, in_features: int = 128, out_features: int = 64): + super().__init__() + self.fc = nn.Linear(in_features, out_features, bias=True) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return self.fc(x) + + +def compile_and_report(in_features: int = 128, out_features: int = 64): + """Export, quantize, lower to TOSA, compile with Vela, print performance.""" + model = SimpleLinear(in_features, out_features).eval() + example_input = (torch.randn(1, in_features),) + + # Export + exported = torch.export.export(model, example_input, strict=True) + + # Quantize + quantizer = EthosUQuantizer() + quantizer.set_global(get_symmetric_quantization_config(is_per_channel=True)) + prepared = prepare_pt2e(exported, quantizer) + prepared(*example_input) + quantized = convert_pt2e(prepared) + + # Build compile spec with verbose-performance and intermediate artifact dump + tmpdir = tempfile.mkdtemp(prefix="conv_fc_cycles_") + compile_spec = ( + EthosUCompileSpec( + "ethos-u55-128", + system_config="Ethos_U55_High_End_Embedded", + memory_mode="Shared_Sram", + extra_flags=["--arena-cache-size=2097152", "--verbose-performance"], + ) + .dump_intermediate_artifacts_to(tmpdir) + ) + + partitioner = create_partitioner(compile_spec) + + print(f"Intermediate artifacts: {tmpdir}") + print(f"Model: nn.Linear({in_features}, {out_features})") + print() + + # Lower and compile — Vela verbose-performance output goes to stdout + print("=" * 70) + print("Vela --verbose-performance output:") + print("=" * 70) + to_edge_transform_and_lower( + quantized, + partitioner=[partitioner], + ) + print("=" * 70) + print() + print("Look for the Conv2D/FullyConnected layer in the table above.") + print(" WITH optimization: NNG Operator = FullyConnected, ~1,341 cycles") + print(" WITHOUT optimization: NNG Operator = Conv2D, ~9,858 cycles") + + +def main(): + compile_and_report() + + +if __name__ == "__main__": + main()