6 changes: 3 additions & 3 deletions .ci/scripts/test_qnn_static_llm.sh
@@ -47,11 +47,11 @@ if [[ "${TASK_NAME}" == "stories_110m" ]]; then
$PYTHON_EXECUTABLE -m pytorch_tokenizers.tools.llama2c.convert -t tokenizer.model -o tokenizer.bin

# Compile only as weight sharing is not applicable on x86.
-$PYTHON_EXECUTABLE backends/qualcomm/tests/test_qnn_delegate.py -k TestExampleLLMScript.test_llama_stories_110m --model SM8650 --build_folder build-android/ --executorch_root . --artifact_dir ./stories_110m_pte_size --llama_artifacts . --compile_only
+$PYTHON_EXECUTABLE backends/qualcomm/tests/test_qnn_delegate.py -k TestExampleLLMScript.test_llama_stories_110m --soc_model SM8650 --build_folder build-android/ --executorch_root . --artifact_dir ./stories_110m_pte_size --llama_artifacts . --compile_only
exit_code1=$?

# Checks accuracy with weight sharing disabled since x86 does not support weight sharing.
-$PYTHON_EXECUTABLE backends/qualcomm/tests/test_qnn_delegate.py -k TestExampleLLMScript.test_llama_stories_110m --model SM8650 --build_folder build-x86/ --executorch_root . --artifact_dir ./stories_110m_accuracy --llama_artifacts . --enable_x86_64
+$PYTHON_EXECUTABLE backends/qualcomm/tests/test_qnn_delegate.py -k TestExampleLLMScript.test_llama_stories_110m --soc_model SM8650 --build_folder build-x86/ --executorch_root . --artifact_dir ./stories_110m_accuracy --llama_artifacts . --enable_x86_64
exit_code2=$?

# Check the exit codes and print messages
@@ -84,7 +84,7 @@ elif [[ "${TASK_NAME}" == "smollm2_135m" ]]; then
if [ -n "$2" ]; then
EXTRA_FLAGS="$EXTRA_FLAGS --static_llm_eval_method $2"
fi
-$PYTHON_EXECUTABLE backends/qualcomm/tests/test_qnn_delegate.py -k TestExampleLLMScript.test_static_llm_model --model_name smollm2_135m --model SM8650 --build_folder build-x86/ --executorch_root . --artifact_dir ./static_smollm2 --enable_x86_64 $EXTRA_FLAGS
+$PYTHON_EXECUTABLE backends/qualcomm/tests/test_qnn_delegate.py -k TestExampleLLMScript.test_static_llm_model --model_name smollm2_135m --soc_model SM8650 --build_folder build-x86/ --executorch_root . --artifact_dir ./static_smollm2 --enable_x86_64 $EXTRA_FLAGS
exit_code1=$?
if [ $exit_code1 -ne 0 ]; then
exit 1
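The CI script above runs two invocations (a compile-only pass and an x86 accuracy pass), records each exit status, and fails if either is nonzero. A minimal sketch of that two-phase pattern — the `run_*` functions here are hypothetical stand-ins for the real test invocations:

```shell
# Stand-ins for the two real test invocations (hypothetical).
run_compile_only() { true; }   # the --compile_only pass against build-android/
run_accuracy_check() { true; } # the --enable_x86_64 pass against build-x86/

run_compile_only
exit_code1=$?
run_accuracy_check
exit_code2=$?

# Fail the job if either phase failed.
if [ $exit_code1 -ne 0 ] || [ $exit_code2 -ne 0 ]; then
  echo "FAIL"
  exit 1
fi
echo "PASS"
```

Capturing `$?` immediately after each command is what makes this work: any command run in between would overwrite the status.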
3 changes: 3 additions & 0 deletions backends/qualcomm/__init__.py
@@ -1,5 +1,7 @@
import os

+import torch
+
from .scripts.download_qnn_sdk import install_qnn_sdk, is_linux_x86


@@ -11,3 +13,4 @@
ok = install_qnn_sdk()
if not ok:
raise RuntimeError("Failed to install QNN SDK. Please check the logs above.")
+torch.backends.mkldnn.enabled = False
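The `__init__.py` change above turns package import into a one-time setup step: install the SDK, then flip a global backend flag (`torch.backends.mkldnn.enabled = False` disables oneDNN). A rough, dependency-free sketch of that import-time pattern — the names below are stand-ins, not the real ExecuTorch helpers:

```python
class _BackendFlag:
    """Stand-in for a global switch like torch.backends.mkldnn."""
    def __init__(self):
        self.enabled = True

mkldnn = _BackendFlag()

def install_sdk() -> bool:
    # Stand-in for install_qnn_sdk(): pretend the download succeeded.
    return True

# Import-time setup, mirroring the package __init__:
if not install_sdk():
    raise RuntimeError("Failed to install QNN SDK. Please check the logs above.")
mkldnn.enabled = False  # mirrors: torch.backends.mkldnn.enabled = False

print(mkldnn.enabled)  # False
```

Because Python caches modules in `sys.modules`, this setup runs only on the first import of the package; later imports see the already-configured state.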
4 changes: 2 additions & 2 deletions backends/qualcomm/bc/test_qnn_static_llama_bc.sh
@@ -27,11 +27,11 @@ touch ${llama_artifacts}/params.json
echo '{"dim": 64, "n_layers": 5, "n_heads": 8, "n_kv_heads": 4, "vocab_size": 512, "multiple_of": 4, "max_seq_len": 512}' > ${llama_artifacts}/params.json

# Checks e2e accuracy
-expected=$($PYTHON_EXECUTABLE backends/qualcomm/tests/test_qnn_delegate.py -k TestExampleLLMScript.test_llama_stories_260k --model SM8650 --build_folder build-x86/ --executorch_root . --artifact_dir . --llama_artifacts $llama_artifacts --enable_x86_64 | grep "Model CI result:")
+expected=$($PYTHON_EXECUTABLE backends/qualcomm/tests/test_qnn_delegate.py -k TestExampleLLMScript.test_llama_stories_260k --soc_model SM8650 --build_folder build-x86/ --executorch_root . --artifact_dir . --llama_artifacts $llama_artifacts --enable_x86_64 | grep "Model CI result:")
exit_code1=$?

# Checks accuracy with precompiled
-output=$($PYTHON_EXECUTABLE backends/qualcomm/tests/test_qnn_delegate.py -k TestExampleLLMScript.test_llama_stories_260k --model SM8650 --build_folder build-x86/ --executorch_root . --artifact_dir $PTE_ARTIFACT --llama_artifacts $llama_artifacts --enable_x86_64 --pre_gen_pte $PTE_ARTIFACT | grep "Model CI result:")
+output=$($PYTHON_EXECUTABLE backends/qualcomm/tests/test_qnn_delegate.py -k TestExampleLLMScript.test_llama_stories_260k --soc_model SM8650 --build_folder build-x86/ --executorch_root . --artifact_dir $PTE_ARTIFACT --llama_artifacts $llama_artifacts --enable_x86_64 --pre_gen_pte $PTE_ARTIFACT | grep "Model CI result:")
exit_code2=$?

if [[ "$output" == "$expected" ]]; then
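The backward-compatibility check above works by capturing the `Model CI result:` line from a fresh export and from a precompiled PTE run, then comparing the two lines verbatim. A minimal sketch of that capture-and-compare pattern, with `printf` standing in for the real test runs:

```shell
# Stand-in runs: each emits some logs plus the marker line we care about.
expected=$(printf 'setup logs\nModel CI result: 0.98\n' | grep "Model CI result:")
output=$(printf 'other logs\nModel CI result: 0.98\n' | grep "Model CI result:")

# Compare only the marker lines, ignoring all surrounding log noise.
if [ "$output" = "$expected" ]; then
  echo "MATCH"
else
  echo "MISMATCH"
  exit 1
fi
```

Filtering through `grep` before comparing keeps the check stable even when unrelated log output differs between the two runs.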
5 changes: 2 additions & 3 deletions backends/qualcomm/builders/README.md
@@ -41,12 +41,11 @@ class MyModel(torch.nn.Module):
```
At the time we try to lower it with Qualcomm backend:
```python
-from executorch.examples.qualcomm.utils import build_executorch_binary
+from executorch.backends.qualcomm.export_utils import build_executorch_binary

build_executorch_binary(
model=MyModel(),
-inputs=(torch.randn(200, 768),),
-soc_model="SM8650"
+qnn_config=qnn_config,
file_name="my_model",
dataset=None,
)
47 changes: 12 additions & 35 deletions backends/qualcomm/debugger/README.md
@@ -31,13 +31,14 @@ To enable model visualization, please add the `--online_prepare` flag.
## Details
### 1. Lower to QNN backend
Generate an ExecuTorch binary for Qualcomm platforms.
+Ensure that `qnn_config.profile_level` is set to 3, which will generate the op trace.
```python
+qnn_config.profile_level = 3
build_executorch_binary(
-model,
-example_input,
-args.model,
-f"{args.artifact}/{pte_filename}",
-[example_input],
+model=model,
+qnn_config=qnn_config,
+file_name=f"{args.artifact}/{pte_filename}",
+dataset=[example_input],
quant_dtype=QuantDtype.use_8a8w,
online_prepare=args.online_prepare,
optrace=True,
@@ -47,14 +48,9 @@
Generate optrace and QHAS files using QNN tools under $QNN_SDK_ROOT. After finishing, you will get a `binaries_trace` dictionary.
``` python
adb = SimpleADB(
-qnn_sdk=os.getenv("QNN_SDK_ROOT"),
-build_path=f"{args.build_folder}",
+qnn_config=qnn_config,
pte_path=f"{args.artifact}/{pte_filename}.pte",
-workspace=f"/data/local/tmp/executorch/{pte_filename}",
-device_id=args.device,
-host_id=args.host,
-soc_model=args.model,
+target=args.target,
+workspace=f"/data/local/tmp/executorch/{pte_filename}",
)
binaries_trace = generate_optrace(
args, adb, f"{args.artifact}/{pte_filename}.pte", example_input
@@ -139,42 +135,23 @@ When executing the script, please add the flag `--dump_intermediate_outputs`. Th
Initialize a `QNNIntermediateDebugger`. Please pass the initialized `QNNIntermediateDebugger` and `args.dump_intermediate_outputs` to the `build_executorch_binary` method as well.
#### Example:
```python
-from executorch.examples.qualcomm.utils import build_executorch_binary
+from executorch.backends.qualcomm.export_utils import build_executorch_binary
from executorch.backends.qualcomm.debugger.qnn_intermediate_debugger import QNNIntermediateDebugger

qnn_intermediate_debugger = QNNIntermediateDebugger()
build_executorch_binary(
model=MyModel(),
inputs=(torch.randn(200, 768),),
-soc_model="SM8650",
+qnn_config=qnn_config,
file_name="my_model",
dataset=my_dataset,
dump_intermediate_outputs=args.dump_intermediate_outputs, # Add this flag
-qnn_intermediate_debugger=qnn_intermediate_debugger, # Add this flag
+qnn_intermediate_debugger=qnn_intermediate_debugger, # Provide this param
)
```

### 4. Set data num to 1
It is perfectly fine for users to pass as many datasets as desired to `build_executorch_binary`, which helps achieve better quantization results. However, after `build_executorch_binary` is called, we need to ensure that only one inference is performed during execution. Please ensure that the CPU and QNN use the same input during execution; otherwise, the debugging results might not be accurate.

-### 5. Pass flag to SimpleADB
-When creating `SimpleADB`, please also pass the flag `args.dump_intermediate_outputs`. This tells the runner to create files that store the intermediate output schema and binary data.
-#### Example:
-```python
-adb = SimpleADB(
-qnn_sdk=os.getenv("QNN_SDK_ROOT"),
-build_path=f"{args.build_folder}",
-pte_path=f"{args.artifact}/{pte_filename}.pte",
-workspace=f"/data/local/tmp/executorch/{pte_filename}",
-device_id=args.device,
-host_id=args.host,
-soc_model=args.model,
-shared_buffer=args.shared_buffer,
-dump_intermediate_outputs=args.dump_intermediate_outputs, # Add this flag
-)
-```

-### 6: Pull and process the results.
+### 5: Pull and process the results.
After QNN execution with the runner, if the previous steps are done correctly, we should be able to get two files: `etdump.etdp` and `debug_output.bin`.
The following example pulls the files back and calls a callback function to process the results. In this callback function, we create the `Inspector`. Then we perform CPU inference to get the CPU intermediate results. Now that we have both the QNN and CPU intermediate results, we can compare their accuracy. With the following example, we should get `debug_graph.svg` as an output in the current directory.
#### Example:
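The final step above compares QNN intermediate outputs against CPU reference outputs. As a library-free illustration (not the actual `Inspector` logic), a comparison callback might score each pair of flattened tensors with cosine similarity; `qnn_out` and `cpu_out` below are hypothetical per-op outputs of the kind pulled from `etdump.etdp` and `debug_output.bin`:

```python
import math

def cosine_similarity(a, b):
    # Similarity between a flattened QNN output and its CPU reference:
    # 1.0 means identical direction, values near 0 mean a large mismatch.
    dot = sum(x * y for x, y in zip(a, b))
    norm_a = math.sqrt(sum(x * x for x in a))
    norm_b = math.sqrt(sum(y * y for y in b))
    return dot / (norm_a * norm_b) if norm_a and norm_b else 0.0

# Hypothetical per-op intermediate outputs:
qnn_out = [0.10, 0.21, 0.29]
cpu_out = [0.10, 0.20, 0.30]
score = cosine_similarity(qnn_out, cpu_out)
print(f"{score:.4f}")
```

Scoring every delegated op this way makes it easy to spot the first node where the QNN and CPU results diverge, which is usually where quantization error is introduced.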